Total coverage: 187916 (16%) of 1224948
/*
 * Created: Fri Jan 8 09:01:26 1999 by faith@valinux.com
 *
 * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
 * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
 * All Rights Reserved.
 *
 * Author Rickard E. (Rik) Faith <faith@valinux.com>
 * Author Gareth Hughes <gareth@valinux.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/pci.h>
#include <linux/uaccess.h>

#include <drm/drm_auth.h>
#include <drm/drm_crtc.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_ioctl.h>
#include <drm/drm_print.h>

#include "drm_crtc_internal.h"
#include "drm_internal.h"

/**
 * DOC: getunique and setversion story
 *
 * BEWARE THE DRAGONS! MIND THE TRAPDOORS!
 *
 * In an attempt to warn anyone else who's trying to figure out what's going
 * on here, I'll try to summarize the story. First things first, let's clear up
 * the names, because the kernel internals, libdrm and the ioctls are all named
 * differently:
 *
 * - GET_UNIQUE ioctl, implemented by drm_getunique is wrapped up in libdrm
 *   through the drmGetBusid function.
 * - The libdrm drmSetBusid function is backed by the SET_UNIQUE ioctl. All
 *   that code is nerved in the kernel with drm_invalid_op().
 * - The internal set_busid kernel functions and driver callbacks are
 *   exclusively used by the SET_VERSION ioctl, because only drm 1.0 (which is
 *   nerved) allowed userspace to set the busid through the above ioctl.
 * - Other ioctls and functions involved are named consistently.
 *
 * For anyone wondering what's the difference between drm 1.1 and 1.4: Correctly
 * handling pci domains in the busid on ppc. Doing this correctly was only
 * implemented in libdrm in 2010, hence can't be nerved yet. No one knows what's
 * special with drm 1.2 and 1.3.
 *
 * Now the actual horror story of how device lookup in drm works. At large,
 * there are two different ways, either by busid, or by device driver name.
 *
 * Opening by busid is fairly simple:
 *
 * 1. First call SET_VERSION to make sure pci domains are handled properly. As a
 *    side-effect this fills out the unique name in the master structure.
 * 2. Call GET_UNIQUE to read out the unique name from the master structure,
 *    which matches the busid thanks to step 1. If it doesn't, proceed to try
 *    the next device node.
 *
 * Opening by name is slightly different:
 *
 * 1.
Directly call VERSION to get the version and to match against the driver * name returned by that ioctl. Note that SET_VERSION is not called, which * means the unique name for the master node just opening is _not_ filled * out. This despite that with current drm device nodes are always bound to * one device, and can't be runtime assigned like with drm 1.0. * 2. Match driver name. If it mismatches, proceed to the next device node. * 3. Call GET_UNIQUE, and check whether the unique name has length zero (by * checking that the first byte in the string is 0). If that's not the case * libdrm skips and proceeds to the next device node. Probably this is just * copypasta from drm 1.0 times where a set unique name meant that the driver * was in use already, but that's just conjecture. * * Long story short: To keep the open by name logic working, GET_UNIQUE must * _not_ return a unique string when SET_VERSION hasn't been called yet, * otherwise libdrm breaks. Even when that unique string can't ever change, and * is totally irrelevant for actually opening the device because runtime * assignable device instances were only support in drm 1.0, which is long dead. * But the libdrm code in drmOpenByName somehow survived, hence this can't be * broken. */ /* * Get the bus id. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_unique structure. * \return zero on success or a negative number on failure. * * Copies the bus id from drm_device::unique into user space. */ int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_unique *u = data; struct drm_master *master; mutex_lock(&dev->master_mutex); master = file_priv->master; if (u->unique_len >= master->unique_len) { if (copy_to_user(u->unique, master->unique, master->unique_len)) { mutex_unlock(&dev->master_mutex); return -EFAULT; } } u->unique_len = master->unique_len; mutex_unlock(&dev->master_mutex); return 0; } static void drm_unset_busid(struct drm_device *dev, struct drm_master *master) { kfree(master->unique); master->unique = NULL; master->unique_len = 0; } static int drm_set_busid(struct drm_device *dev, struct drm_file *file_priv) { struct drm_master *master = file_priv->master; int ret; if (master->unique != NULL) drm_unset_busid(dev, master); if (dev->dev && dev_is_pci(dev->dev)) { ret = drm_pci_set_busid(dev, master); if (ret) { drm_unset_busid(dev, master); return ret; } } else { WARN_ON(!dev->unique); master->unique = kstrdup(dev->unique, GFP_KERNEL); if (master->unique) master->unique_len = strlen(dev->unique); } return 0; } /* * Get client information. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_client structure. * * \return zero on success or a negative number on failure. * * Searches for the client with the specified index and copies its information * into userspace */ int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_client *client = data; /* * Hollowed-out getclient ioctl to keep some dead old drm tests/tools * not breaking completely. Userspace tools stop enumerating one they * get -EINVAL, hence this is the return value we need to hand back for * no clients tracked. * * Unfortunately some clients (*cough* libva *cough*) use this in a fun * attempt to figure out whether they're authenticated or not. 
Since * that's the only thing they care about, give it to the directly * instead of walking one giant list. */ if (client->idx == 0) { client->auth = file_priv->authenticated; client->pid = task_pid_vnr(current); client->uid = overflowuid; client->magic = 0; client->iocs = 0; return 0; } else { return -EINVAL; } } /* * Get statistics information. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_stats structure. * * \return zero on success or a negative number on failure. */ static int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_stats *stats = data; /* Clear stats to prevent userspace from eating its stack garbage. */ memset(stats, 0, sizeof(*stats)); return 0; } /* * Get device/driver capabilities */ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_get_cap *req = data; struct drm_crtc *crtc; req->value = 0; /* Only some caps make sense with UMS/render-only drivers. */ switch (req->capability) { case DRM_CAP_TIMESTAMP_MONOTONIC: req->value = 1; return 0; case DRM_CAP_PRIME: req->value = DRM_PRIME_CAP_IMPORT | DRM_PRIME_CAP_EXPORT; return 0; case DRM_CAP_SYNCOBJ: req->value = drm_core_check_feature(dev, DRIVER_SYNCOBJ); return 0; case DRM_CAP_SYNCOBJ_TIMELINE: req->value = drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE); return 0; } /* Other caps only work with KMS drivers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; switch (req->capability) { case DRM_CAP_DUMB_BUFFER: if (dev->driver->dumb_create) req->value = 1; break; case DRM_CAP_VBLANK_HIGH_CRTC: req->value = 1; break; case DRM_CAP_DUMB_PREFERRED_DEPTH: req->value = dev->mode_config.preferred_depth; break; case DRM_CAP_DUMB_PREFER_SHADOW: req->value = dev->mode_config.prefer_shadow; break; case DRM_CAP_ASYNC_PAGE_FLIP: req->value = dev->mode_config.async_page_flip; break; case DRM_CAP_PAGE_FLIP_TARGET: req->value = 1; drm_for_each_crtc(crtc, dev) { if (!crtc->funcs->page_flip_target) req->value = 0; } break; case DRM_CAP_CURSOR_WIDTH: if (dev->mode_config.cursor_width) req->value = dev->mode_config.cursor_width; else req->value = 64; break; case DRM_CAP_CURSOR_HEIGHT: if (dev->mode_config.cursor_height) req->value = dev->mode_config.cursor_height; else req->value = 64; break; case DRM_CAP_ADDFB2_MODIFIERS: req->value = !dev->mode_config.fb_modifiers_not_supported; break; case DRM_CAP_CRTC_IN_VBLANK_EVENT: req->value = 1; break; case DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP: req->value = drm_core_check_feature(dev, DRIVER_ATOMIC) && dev->mode_config.async_page_flip; break; default: return -EINVAL; } return 0; } /* * Set device/driver capabilities */ static int drm_setclientcap(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_set_client_cap *req = data; /* No render-only settable capabilities for now */ /* Below caps that only works with KMS drivers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; switch (req->capability) { case DRM_CLIENT_CAP_STEREO_3D: if (req->value > 1) return -EINVAL; file_priv->stereo_allowed = req->value; break; case DRM_CLIENT_CAP_UNIVERSAL_PLANES: if (req->value > 1) return -EINVAL; file_priv->universal_planes = req->value; break; case DRM_CLIENT_CAP_ATOMIC: if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) return -EOPNOTSUPP; /* The modesetting DDX has a totally broken idea of atomic. 
*/ if (current->comm[0] == 'X' && req->value == 1) { pr_info("broken atomic modeset userspace detected, disabling atomic\n"); return -EOPNOTSUPP; } if (req->value > 2) return -EINVAL; file_priv->atomic = req->value; file_priv->universal_planes = req->value; /* * No atomic user-space blows up on aspect ratio mode bits. */ file_priv->aspect_ratio_allowed = req->value; break; case DRM_CLIENT_CAP_ASPECT_RATIO: if (req->value > 1) return -EINVAL; file_priv->aspect_ratio_allowed = req->value; break; case DRM_CLIENT_CAP_WRITEBACK_CONNECTORS: if (!file_priv->atomic) return -EINVAL; if (req->value > 1) return -EINVAL; file_priv->writeback_connectors = req->value; break; case DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT: if (!drm_core_check_feature(dev, DRIVER_CURSOR_HOTSPOT)) return -EOPNOTSUPP; if (!file_priv->atomic) return -EINVAL; if (req->value > 1) return -EINVAL; file_priv->supports_virtualized_cursor_plane = req->value; break; default: return -EINVAL; } return 0; } /* * Setversion ioctl. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_lock structure. * \return zero on success or negative number on failure. * * Sets the requested interface version */ static int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_set_version *sv = data; int if_version, retcode = 0; mutex_lock(&dev->master_mutex); if (sv->drm_di_major != -1) { if (sv->drm_di_major != DRM_IF_MAJOR || sv->drm_di_minor < 0 || sv->drm_di_minor > DRM_IF_MINOR) { retcode = -EINVAL; goto done; } if_version = DRM_IF_VERSION(sv->drm_di_major, sv->drm_di_minor); dev->if_version = max(if_version, dev->if_version); if (sv->drm_di_minor >= 1) { /* * Version 1.1 includes tying of DRM to specific device * Version 1.4 has proper PCI domain support */ retcode = drm_set_busid(dev, file_priv); if (retcode) goto done; } } if (sv->drm_dd_major != -1) { if (sv->drm_dd_major != dev->driver->major || sv->drm_dd_minor < 0 || sv->drm_dd_minor > dev->driver->minor) { retcode = -EINVAL; goto done; } } done: sv->drm_di_major = DRM_IF_MAJOR; sv->drm_di_minor = DRM_IF_MINOR; sv->drm_dd_major = dev->driver->major; sv->drm_dd_minor = dev->driver->minor; mutex_unlock(&dev->master_mutex); return retcode; } /** * drm_noop - DRM no-op ioctl implementation * @dev: DRM device for the ioctl * @data: data pointer for the ioctl * @file_priv: DRM file for the ioctl call * * This no-op implementation for drm ioctls is useful for deprecated * functionality where we can't return a failure code because existing userspace * checks the result of the ioctl, but doesn't care about the action. * * Always returns successfully with 0. */ int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_dbg_core(dev, "\n"); return 0; } EXPORT_SYMBOL(drm_noop); /** * drm_invalid_op - DRM invalid ioctl implementation * @dev: DRM device for the ioctl * @data: data pointer for the ioctl * @file_priv: DRM file for the ioctl call * * This no-op implementation for drm ioctls is useful for deprecated * functionality where we really don't want to allow userspace to call the ioctl * any more. This is the case for old ums interfaces for drivers that * transitioned to kms gradually and so kept the old legacy tables around. This * only applies to radeon and i915 kms drivers, other drivers shouldn't need to * use this function. * * Always fails with a return value of -EINVAL. 
*/ int drm_invalid_op(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -EINVAL; } EXPORT_SYMBOL(drm_invalid_op); /* * Copy and IOCTL return string to user space */ static int drm_copy_field(char __user *buf, size_t *buf_len, const char *value) { size_t len; /* don't attempt to copy a NULL pointer */ if (WARN_ONCE(!value, "BUG: the value to copy was not set!")) { *buf_len = 0; return 0; } /* don't overflow userbuf */ len = strlen(value); if (len > *buf_len) len = *buf_len; /* let userspace know exact length of driver value (which could be * larger than the userspace-supplied buffer) */ *buf_len = strlen(value); /* finally, try filling in the userbuf */ if (len && buf) if (copy_to_user(buf, value, len)) return -EFAULT; return 0; } /* * Get version information * * \param inode device inode. * \param filp file pointer. * \param cmd command. * \param arg user argument, pointing to a drm_version structure. * \return zero on success or negative number on failure. * * Fills in the version information in \p arg. */ int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_version *version = data; int err; version->version_major = dev->driver->major; version->version_minor = dev->driver->minor; version->version_patchlevel = dev->driver->patchlevel; err = drm_copy_field(version->name, &version->name_len, dev->driver->name); if (!err) err = drm_copy_field(version->date, &version->date_len, dev->driver->date); if (!err) err = drm_copy_field(version->desc, &version->desc_len, dev->driver->desc); return err; } static int drm_ioctl_permit(u32 flags, struct drm_file *file_priv) { /* ROOT_ONLY is only for CAP_SYS_ADMIN */ if (unlikely((flags & DRM_ROOT_ONLY) && !capable(CAP_SYS_ADMIN))) return -EACCES; /* AUTH is only for authenticated or render client */ if (unlikely((flags & DRM_AUTH) && !drm_is_render_client(file_priv) && !file_priv->authenticated)) return -EACCES; /* MASTER is only for master or control clients */ if (unlikely((flags & DRM_MASTER) && !drm_is_current_master(file_priv))) return -EACCES; /* Render clients must be explicitly allowed */ if (unlikely(!(flags & DRM_RENDER_ALLOW) && drm_is_render_client(file_priv))) return -EACCES; return 0; } #define DRM_IOCTL_DEF(ioctl, _func, _flags) \ [DRM_IOCTL_NR(ioctl)] = { \ .cmd = ioctl, \ .func = _func, \ .flags = _flags, \ .name = #ioctl \ } /* Ioctl table */ static const struct drm_ioctl_desc drm_ioctls[] = { DRM_IOCTL_DEF(DRM_IOCTL_VERSION, drm_version, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_GET_UNIQUE, drm_getunique, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_MAGIC, drm_getmagic, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_CLIENT, drm_getclient, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_STATS, drm_getstats, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_CAP, drm_getcap, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SET_CLIENT_CAP, drm_setclientcap, 0), DRM_IOCTL_DEF(DRM_IOCTL_SET_VERSION, drm_setversion, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_SET_UNIQUE, drm_invalid_op, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_BLOCK, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_UNBLOCK, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_AUTH_MAGIC, drm_authmagic, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_SET_MASTER, drm_setmaster_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_DROP_MASTER, drm_dropmaster_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_ADD_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_RM_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_FINISH, 
drm_noop, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_WAIT_VBLANK, drm_wait_vblank_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_UPDATE_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_GEM_CLOSE, drm_gem_close_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_GEM_FLINK, drm_gem_flink_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_GEM_OPEN, drm_gem_open_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETRESOURCES, drm_mode_getresources, 0), DRM_IOCTL_DEF(DRM_IOCTL_PRIME_HANDLE_TO_FD, drm_prime_handle_to_fd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_PRIME_FD_TO_HANDLE, drm_prime_fd_to_handle_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPLANERESOURCES, drm_mode_getplane_res, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCRTC, drm_mode_getcrtc, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETCRTC, drm_mode_setcrtc, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPLANE, drm_mode_getplane, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETPLANE, drm_mode_setplane, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR, drm_mode_cursor_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETGAMMA, drm_mode_gamma_get_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETGAMMA, drm_mode_gamma_set_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETENCODER, drm_mode_getencoder, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCONNECTOR, drm_mode_getconnector, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ATTACHMODE, drm_noop, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DETACHMODE, drm_noop, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPERTY, drm_mode_getproperty_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETPROPERTY, drm_connector_property_set_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPBLOB, drm_mode_getblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETFB, drm_mode_getfb, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETFB2, drm_mode_getfb2_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ADDFB, drm_mode_addfb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ADDFB2, drm_mode_addfb2_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_RMFB, drm_mode_rmfb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CLOSEFB, drm_mode_closefb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_PAGE_FLIP, drm_mode_page_flip_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DIRTYFB, drm_mode_dirtyfb_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_DUMB, drm_mode_create_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_MAP_DUMB, drm_mode_mmap_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROY_DUMB, drm_mode_destroy_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_GETPROPERTIES, drm_mode_obj_get_properties_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_SETPROPERTY, drm_mode_obj_set_property_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR2, drm_mode_cursor2_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ATOMIC, drm_mode_atomic_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATEPROPBLOB, drm_mode_createblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROYPROPBLOB, drm_mode_destroyblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_CREATE, drm_syncobj_create_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_DESTROY, drm_syncobj_destroy_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, drm_syncobj_handle_to_fd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, drm_syncobj_fd_to_handle_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TRANSFER, drm_syncobj_transfer_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_WAIT, drm_syncobj_wait_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, drm_syncobj_timeline_wait_ioctl, DRM_RENDER_ALLOW), 
DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_EVENTFD, drm_syncobj_eventfd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_RESET, drm_syncobj_reset_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_SIGNAL, drm_syncobj_signal_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, drm_syncobj_timeline_signal_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_QUERY, drm_syncobj_query_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_CRTC_GET_SEQUENCE, drm_crtc_get_sequence_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_CRTC_QUEUE_SEQUENCE, drm_crtc_queue_sequence_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_LEASE, drm_mode_create_lease_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_LIST_LESSEES, drm_mode_list_lessees_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GET_LEASE, drm_mode_get_lease_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_REVOKE_LEASE, drm_mode_revoke_lease_ioctl, DRM_MASTER), }; #define DRM_CORE_IOCTL_COUNT ARRAY_SIZE(drm_ioctls) /** * DOC: driver specific ioctls * * First things first, driver private IOCTLs should only be needed for drivers * supporting rendering. Kernel modesetting is all standardized, and extended * through properties. There are a few exceptions in some existing drivers, * which define IOCTL for use by the display DRM master, but they all predate * properties. * * Now if you do have a render driver you always have to support it through * driver private properties. There's a few steps needed to wire all the things * up. * * First you need to define the structure for your IOCTL in your driver private * UAPI header in ``include/uapi/drm/my_driver_drm.h``:: * * struct my_driver_operation { * u32 some_thing; * u32 another_thing; * }; * * Please make sure that you follow all the best practices from * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. * * Then you need to define your IOCTL number, using one of DRM_IO(), DRM_IOR(), * DRM_IOW() or DRM_IOWR(). It must start with the DRM_IOCTL\_ prefix:: * * ##define DRM_IOCTL_MY_DRIVER_OPERATION \ * DRM_IOW(DRM_COMMAND_BASE, struct my_driver_operation) * * DRM driver private IOCTL must be in the range from DRM_COMMAND_BASE to * DRM_COMMAND_END. Finally you need an array of &struct drm_ioctl_desc to wire * up the handlers and set the access rights:: * * static const struct drm_ioctl_desc my_driver_ioctls[] = { * DRM_IOCTL_DEF_DRV(MY_DRIVER_OPERATION, my_driver_operation, * DRM_AUTH|DRM_RENDER_ALLOW), * }; * * And then assign this to the &drm_driver.ioctls field in your driver * structure. * * See the separate chapter on :ref:`file operations<drm_driver_fops>` for how * the driver-specific IOCTLs are wired up. */ long drm_ioctl_kernel(struct file *file, drm_ioctl_t *func, void *kdata, u32 flags) { struct drm_file *file_priv = file->private_data; struct drm_device *dev = file_priv->minor->dev; int ret; /* Update drm_file owner if fd was passed along. */ drm_file_update_pid(file_priv); if (drm_dev_is_unplugged(dev)) return -ENODEV; ret = drm_ioctl_permit(flags, file_priv); if (unlikely(ret)) return ret; return func(dev, kdata, file_priv); } EXPORT_SYMBOL(drm_ioctl_kernel); /** * drm_ioctl - ioctl callback implementation for DRM drivers * @filp: file this ioctl is called on * @cmd: ioctl cmd number * @arg: user argument * * Looks up the ioctl function in the DRM core and the driver dispatch table, * stored in &drm_driver.ioctls. 
It checks for necessary permission by calling * drm_ioctl_permit(), and dispatches to the respective function. * * Returns: * Zero on success, negative error code on failure. */ long drm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev; const struct drm_ioctl_desc *ioctl = NULL; drm_ioctl_t *func; unsigned int nr = DRM_IOCTL_NR(cmd); int retcode = -EINVAL; char stack_kdata[128]; char *kdata = NULL; unsigned int in_size, out_size, drv_size, ksize; bool is_driver_ioctl; dev = file_priv->minor->dev; if (drm_dev_is_unplugged(dev)) return -ENODEV; if (DRM_IOCTL_TYPE(cmd) != DRM_IOCTL_BASE) return -ENOTTY; is_driver_ioctl = nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END; if (is_driver_ioctl) { /* driver ioctl */ unsigned int index = nr - DRM_COMMAND_BASE; if (index >= dev->driver->num_ioctls) goto err_i1; index = array_index_nospec(index, dev->driver->num_ioctls); ioctl = &dev->driver->ioctls[index]; } else { /* core ioctl */ if (nr >= DRM_CORE_IOCTL_COUNT) goto err_i1; nr = array_index_nospec(nr, DRM_CORE_IOCTL_COUNT); ioctl = &drm_ioctls[nr]; } drv_size = _IOC_SIZE(ioctl->cmd); out_size = in_size = _IOC_SIZE(cmd); if ((cmd & ioctl->cmd & IOC_IN) == 0) in_size = 0; if ((cmd & ioctl->cmd & IOC_OUT) == 0) out_size = 0; ksize = max(max(in_size, out_size), drv_size); drm_dbg_core(dev, "comm=\"%s\" pid=%d, dev=0x%lx, auth=%d, %s\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file_priv->minor->kdev->devt), file_priv->authenticated, ioctl->name); /* Do not trust userspace, use our own definition */ func = ioctl->func; if (unlikely(!func)) { drm_dbg_core(dev, "no function\n"); retcode = -EINVAL; goto err_i1; } if (ksize <= sizeof(stack_kdata)) { kdata = stack_kdata; } else { kdata = kmalloc(ksize, GFP_KERNEL); if (!kdata) { retcode = -ENOMEM; goto err_i1; } } if (copy_from_user(kdata, (void __user *)arg, in_size) != 0) { retcode = -EFAULT; goto err_i1; } if (ksize > in_size) memset(kdata + in_size, 0, ksize - in_size); retcode = drm_ioctl_kernel(filp, func, kdata, ioctl->flags); if (copy_to_user((void __user *)arg, kdata, out_size) != 0) retcode = -EFAULT; err_i1: if (!ioctl) drm_dbg_core(dev, "invalid ioctl: comm=\"%s\", pid=%d, dev=0x%lx, auth=%d, cmd=0x%02x, nr=0x%02x\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file_priv->minor->kdev->devt), file_priv->authenticated, cmd, nr); if (kdata != stack_kdata) kfree(kdata); if (retcode) drm_dbg_core(dev, "comm=\"%s\", pid=%d, ret=%d\n", current->comm, task_pid_nr(current), retcode); return retcode; } EXPORT_SYMBOL(drm_ioctl); /** * drm_ioctl_flags - Check for core ioctl and return ioctl permission flags * @nr: ioctl number * @flags: where to return the ioctl permission flags * * This ioctl is only used by the vmwgfx driver to augment the access checks * done by the drm core and insofar a pretty decent layering violation. This * shouldn't be used by any drivers. * * Returns: * True if the @nr corresponds to a DRM core ioctl number, false otherwise. */ bool drm_ioctl_flags(unsigned int nr, unsigned int *flags) { if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) return false; if (nr >= DRM_CORE_IOCTL_COUNT) return false; nr = array_index_nospec(nr, DRM_CORE_IOCTL_COUNT); *flags = drm_ioctls[nr].flags; return true; } EXPORT_SYMBOL(drm_ioctl_flags);
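The "getunique and setversion story" DOC comment above describes the open-by-busid dance from the kernel's point of view. Purely as an illustrative sketch (not part of drm_ioctl.c or libdrm), the snippet below shows how a userspace caller might drive DRM_IOCTL_SET_VERSION and DRM_IOCTL_GET_UNIQUE to read back the bus id. The helper name probe_busid and its error handling are invented for this example; the fd is assumed to be a freshly opened /dev/dri/card* node that holds DRM master (SET_VERSION is DRM_MASTER only), and the uapi include path may differ depending on how the headers are installed.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <drm/drm.h>	/* uapi header; path may be e.g. <libdrm/drm.h> on some systems */

/* Hypothetical helper: returns a malloc'ed bus id string, or NULL on failure. */
static char *probe_busid(int fd)
{
	struct drm_set_version sv = {
		.drm_di_major = 1,	/* DRM_IF_MAJOR */
		.drm_di_minor = 4,	/* 1.4: bus ids include the PCI domain */
		.drm_dd_major = -1,	/* don't care about the driver version */
		.drm_dd_minor = -1,
	};
	struct drm_unique u = { .unique_len = 0, .unique = NULL };
	char *busid;

	/* Step 1: SET_VERSION fills out the unique (bus id) in the master. */
	if (ioctl(fd, DRM_IOCTL_SET_VERSION, &sv))
		return NULL;

	/* Step 2a: with unique_len == 0 the kernel only reports the length. */
	if (ioctl(fd, DRM_IOCTL_GET_UNIQUE, &u) || !u.unique_len)
		return NULL;

	busid = calloc(1, u.unique_len + 1);	/* +1: the kernel does not NUL-terminate */
	if (!busid)
		return NULL;

	/* Step 2b: now that unique_len covers the string, the kernel copies it. */
	u.unique = busid;
	if (ioctl(fd, DRM_IOCTL_GET_UNIQUE, &u)) {
		free(busid);
		return NULL;
	}
	return busid;
}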
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Loopback soundcard
 *
 * Original code:
 * Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 *
 * More accurate positioning and full-duplex support:
 * Copyright (c) Ahmet İnan <ainan at mathematik.uni-freiburg.de>
 *
 * Major (almost complete) rewrite:
 * Copyright (c) by Takashi Iwai <tiwai@suse.de>
 *
 * A next major update in 2010 (separate timers for playback and capture):
 * Copyright (c) Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <sound/core.h>
#include <sound/control.h>
#include <sound/pcm.h>
#include <sound/pcm_params.h>
#include <sound/info.h>
#include <sound/initval.h>
#include <sound/timer.h>

MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
MODULE_DESCRIPTION("A loopback soundcard");
MODULE_LICENSE("GPL");

#define MAX_PCM_SUBSTREAMS	8

static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 0};
static int pcm_substreams[SNDRV_CARDS] = {[0 ...
(SNDRV_CARDS - 1)] = 8}; static int pcm_notify[SNDRV_CARDS]; static char *timer_source[SNDRV_CARDS]; module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for loopback soundcard."); module_param_array(id, charp, NULL, 0444); MODULE_PARM_DESC(id, "ID string for loopback soundcard."); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable this loopback soundcard."); module_param_array(pcm_substreams, int, NULL, 0444); MODULE_PARM_DESC(pcm_substreams, "PCM substreams # (1-8) for loopback driver."); module_param_array(pcm_notify, int, NULL, 0444); MODULE_PARM_DESC(pcm_notify, "Break capture when PCM format/rate/channels changes."); module_param_array(timer_source, charp, NULL, 0444); MODULE_PARM_DESC(timer_source, "Sound card name or number and device/subdevice number of timer to be used. Empty string for jiffies timer [default]."); #define NO_PITCH 100000 #define CABLE_VALID_PLAYBACK BIT(SNDRV_PCM_STREAM_PLAYBACK) #define CABLE_VALID_CAPTURE BIT(SNDRV_PCM_STREAM_CAPTURE) #define CABLE_VALID_BOTH (CABLE_VALID_PLAYBACK | CABLE_VALID_CAPTURE) struct loopback_cable; struct loopback_pcm; struct loopback_ops { /* optional * call in loopback->cable_lock */ int (*open)(struct loopback_pcm *dpcm); /* required * call in cable->lock */ int (*start)(struct loopback_pcm *dpcm); /* required * call in cable->lock */ int (*stop)(struct loopback_pcm *dpcm); /* optional */ int (*stop_sync)(struct loopback_pcm *dpcm); /* optional */ int (*close_substream)(struct loopback_pcm *dpcm); /* optional * call in loopback->cable_lock */ int (*close_cable)(struct loopback_pcm *dpcm); /* optional * call in cable->lock */ unsigned int (*pos_update)(struct loopback_cable *cable); /* optional */ void (*dpcm_info)(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer); }; struct loopback_cable { spinlock_t lock; struct loopback_pcm *streams[2]; struct snd_pcm_hardware hw; /* flags */ unsigned int valid; unsigned int running; unsigned int pause; /* timer specific */ const struct loopback_ops *ops; /* If sound timer is used */ struct { int stream; struct snd_timer_id id; struct work_struct event_work; struct snd_timer_instance *instance; } snd_timer; }; struct loopback_setup { unsigned int notify: 1; unsigned int rate_shift; snd_pcm_format_t format; unsigned int rate; snd_pcm_access_t access; unsigned int channels; struct snd_ctl_elem_id active_id; struct snd_ctl_elem_id format_id; struct snd_ctl_elem_id rate_id; struct snd_ctl_elem_id channels_id; struct snd_ctl_elem_id access_id; }; struct loopback { struct snd_card *card; struct mutex cable_lock; struct loopback_cable *cables[MAX_PCM_SUBSTREAMS][2]; struct snd_pcm *pcm[2]; struct loopback_setup setup[MAX_PCM_SUBSTREAMS][2]; const char *timer_source; }; struct loopback_pcm { struct loopback *loopback; struct snd_pcm_substream *substream; struct loopback_cable *cable; unsigned int pcm_buffer_size; unsigned int buf_pos; /* position in buffer */ unsigned int silent_size; /* PCM parameters */ unsigned int pcm_period_size; unsigned int pcm_bps; /* bytes per second */ unsigned int pcm_salign; /* bytes per sample * channels */ unsigned int pcm_rate_shift; /* rate shift value */ /* flags */ unsigned int period_update_pending :1; /* timer stuff */ unsigned int irq_pos; /* fractional IRQ position in jiffies * ticks */ unsigned int period_size_frac; /* period size in jiffies ticks */ unsigned int last_drift; unsigned long last_jiffies; /* If jiffies timer is used */ struct timer_list timer; /* size of per channel buffer in case of 
non-interleaved access */ unsigned int channel_buf_n; }; static struct platform_device *devices[SNDRV_CARDS]; static inline unsigned int byte_pos(struct loopback_pcm *dpcm, unsigned int x) { if (dpcm->pcm_rate_shift == NO_PITCH) { x /= HZ; } else { x = div_u64(NO_PITCH * (unsigned long long)x, HZ * (unsigned long long)dpcm->pcm_rate_shift); } return x - (x % dpcm->pcm_salign); } static inline unsigned int frac_pos(struct loopback_pcm *dpcm, unsigned int x) { if (dpcm->pcm_rate_shift == NO_PITCH) { /* no pitch */ return x * HZ; } else { x = div_u64(dpcm->pcm_rate_shift * (unsigned long long)x * HZ, NO_PITCH); } return x; } static inline struct loopback_setup *get_setup(struct loopback_pcm *dpcm) { int device = dpcm->substream->pstr->pcm->device; if (dpcm->substream->stream == SNDRV_PCM_STREAM_PLAYBACK) device ^= 1; return &dpcm->loopback->setup[dpcm->substream->number][device]; } static inline unsigned int get_notify(struct loopback_pcm *dpcm) { return get_setup(dpcm)->notify; } static inline unsigned int get_rate_shift(struct loopback_pcm *dpcm) { return get_setup(dpcm)->rate_shift; } /* call in cable->lock */ static int loopback_jiffies_timer_start(struct loopback_pcm *dpcm) { unsigned long tick; unsigned int rate_shift = get_rate_shift(dpcm); if (rate_shift != dpcm->pcm_rate_shift) { dpcm->pcm_rate_shift = rate_shift; dpcm->period_size_frac = frac_pos(dpcm, dpcm->pcm_period_size); } if (dpcm->period_size_frac <= dpcm->irq_pos) { dpcm->irq_pos %= dpcm->period_size_frac; dpcm->period_update_pending = 1; } tick = dpcm->period_size_frac - dpcm->irq_pos; tick = DIV_ROUND_UP(tick, dpcm->pcm_bps); mod_timer(&dpcm->timer, jiffies + tick); return 0; } /* call in cable->lock */ static int loopback_snd_timer_start(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; int err; /* Loopback device has to use same period as timer card. Therefore * wake up for each snd_pcm_period_elapsed() call of timer card. */ err = snd_timer_start(cable->snd_timer.instance, 1); if (err < 0) { /* do not report error if trying to start but already * running. For example called by opposite substream * of the same cable */ if (err == -EBUSY) return 0; pcm_err(dpcm->substream->pcm, "snd_timer_start(%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); } return err; } /* call in cable->lock */ static inline int loopback_jiffies_timer_stop(struct loopback_pcm *dpcm) { del_timer(&dpcm->timer); dpcm->timer.expires = 0; return 0; } /* call in cable->lock */ static int loopback_snd_timer_stop(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; int err; /* only stop if both devices (playback and capture) are not running */ if (cable->running ^ cable->pause) return 0; err = snd_timer_stop(cable->snd_timer.instance); if (err < 0) { pcm_err(dpcm->substream->pcm, "snd_timer_stop(%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); } return err; } static inline int loopback_jiffies_timer_stop_sync(struct loopback_pcm *dpcm) { del_timer_sync(&dpcm->timer); return 0; } /* call in loopback->cable_lock */ static int loopback_snd_timer_close_cable(struct loopback_pcm *dpcm) { struct loopback_cable *cable = dpcm->cable; /* snd_timer was not opened */ if (!cable->snd_timer.instance) return 0; /* will only be called from free_cable() when other stream was * already closed. Other stream cannot be reopened as long as * loopback->cable_lock is locked. 
Therefore no need to lock * cable->lock; */ snd_timer_close(cable->snd_timer.instance); /* wait till drain work has finished if requested */ cancel_work_sync(&cable->snd_timer.event_work); snd_timer_instance_free(cable->snd_timer.instance); memset(&cable->snd_timer, 0, sizeof(cable->snd_timer)); return 0; } static bool is_access_interleaved(snd_pcm_access_t access) { switch (access) { case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED: case SNDRV_PCM_ACCESS_RW_INTERLEAVED: return true; default: return false; } }; static int loopback_check_format(struct loopback_cable *cable, int stream) { struct snd_pcm_runtime *runtime, *cruntime; struct loopback_setup *setup; struct snd_card *card; int check; if (cable->valid != CABLE_VALID_BOTH) { if (stream == SNDRV_PCM_STREAM_PLAYBACK) goto __notify; return 0; } runtime = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]-> substream->runtime; cruntime = cable->streams[SNDRV_PCM_STREAM_CAPTURE]-> substream->runtime; check = runtime->format != cruntime->format || runtime->rate != cruntime->rate || runtime->channels != cruntime->channels || is_access_interleaved(runtime->access) != is_access_interleaved(cruntime->access); if (!check) return 0; if (stream == SNDRV_PCM_STREAM_CAPTURE) { return -EIO; } else { snd_pcm_stop(cable->streams[SNDRV_PCM_STREAM_CAPTURE]-> substream, SNDRV_PCM_STATE_DRAINING); __notify: runtime = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]-> substream->runtime; setup = get_setup(cable->streams[SNDRV_PCM_STREAM_PLAYBACK]); card = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]->loopback->card; if (setup->format != runtime->format) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->format_id); setup->format = runtime->format; } if (setup->rate != runtime->rate) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->rate_id); setup->rate = runtime->rate; } if (setup->channels != runtime->channels) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->channels_id); setup->channels = runtime->channels; } if (is_access_interleaved(setup->access) != is_access_interleaved(runtime->access)) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->access_id); setup->access = runtime->access; } } return 0; } static void loopback_active_notify(struct loopback_pcm *dpcm) { snd_ctl_notify(dpcm->loopback->card, SNDRV_CTL_EVENT_MASK_VALUE, &get_setup(dpcm)->active_id); } static int loopback_trigger(struct snd_pcm_substream *substream, int cmd) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; int err = 0, stream = 1 << substream->stream; switch (cmd) { case SNDRV_PCM_TRIGGER_START: err = loopback_check_format(cable, substream->stream); if (err < 0) return err; dpcm->last_jiffies = jiffies; dpcm->pcm_rate_shift = 0; dpcm->last_drift = 0; spin_lock(&cable->lock); cable->running |= stream; cable->pause &= ~stream; err = cable->ops->start(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; case SNDRV_PCM_TRIGGER_STOP: spin_lock(&cable->lock); cable->running &= ~stream; cable->pause &= ~stream; err = cable->ops->stop(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: spin_lock(&cable->lock); cable->pause |= stream; err = cable->ops->stop(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); 
break; case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: case SNDRV_PCM_TRIGGER_RESUME: spin_lock(&cable->lock); dpcm->last_jiffies = jiffies; cable->pause &= ~stream; err = cable->ops->start(dpcm); spin_unlock(&cable->lock); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) loopback_active_notify(dpcm); break; default: return -EINVAL; } return err; } static void params_change(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; cable->hw.formats = pcm_format_to_bits(runtime->format); cable->hw.rate_min = runtime->rate; cable->hw.rate_max = runtime->rate; cable->hw.channels_min = runtime->channels; cable->hw.channels_max = runtime->channels; if (cable->snd_timer.instance) { cable->hw.period_bytes_min = frames_to_bytes(runtime, runtime->period_size); cable->hw.period_bytes_max = cable->hw.period_bytes_min; } } static int loopback_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; int err, bps, salign; if (cable->ops->stop_sync) { err = cable->ops->stop_sync(dpcm); if (err < 0) return err; } salign = (snd_pcm_format_physical_width(runtime->format) * runtime->channels) / 8; bps = salign * runtime->rate; if (bps <= 0 || salign <= 0) return -EINVAL; dpcm->buf_pos = 0; dpcm->pcm_buffer_size = frames_to_bytes(runtime, runtime->buffer_size); dpcm->channel_buf_n = dpcm->pcm_buffer_size / runtime->channels; if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { /* clear capture buffer */ dpcm->silent_size = dpcm->pcm_buffer_size; snd_pcm_format_set_silence(runtime->format, runtime->dma_area, runtime->buffer_size * runtime->channels); } dpcm->irq_pos = 0; dpcm->period_update_pending = 0; dpcm->pcm_bps = bps; dpcm->pcm_salign = salign; dpcm->pcm_period_size = frames_to_bytes(runtime, runtime->period_size); mutex_lock(&dpcm->loopback->cable_lock); if (!(cable->valid & ~(1 << substream->stream)) || (get_setup(dpcm)->notify && substream->stream == SNDRV_PCM_STREAM_PLAYBACK)) params_change(substream); cable->valid |= 1 << substream->stream; mutex_unlock(&dpcm->loopback->cable_lock); return 0; } static void clear_capture_buf(struct loopback_pcm *dpcm, unsigned int bytes) { struct snd_pcm_runtime *runtime = dpcm->substream->runtime; char *dst = runtime->dma_area; unsigned int dst_off = dpcm->buf_pos; if (dpcm->silent_size >= dpcm->pcm_buffer_size) return; if (dpcm->silent_size + bytes > dpcm->pcm_buffer_size) bytes = dpcm->pcm_buffer_size - dpcm->silent_size; for (;;) { unsigned int size = bytes; if (dst_off + size > dpcm->pcm_buffer_size) size = dpcm->pcm_buffer_size - dst_off; snd_pcm_format_set_silence(runtime->format, dst + dst_off, bytes_to_frames(runtime, size) * runtime->channels); dpcm->silent_size += size; bytes -= size; if (!bytes) break; dst_off = 0; } } static void copy_play_buf_part_n(struct loopback_pcm *play, struct loopback_pcm *capt, unsigned int size, unsigned int src_off, unsigned int dst_off) { unsigned int channels = capt->substream->runtime->channels; unsigned int size_p_ch = size / channels; unsigned int src_off_ch = src_off / channels; unsigned int dst_off_ch = dst_off / channels; int i; for (i = 0; i < channels; i++) { memcpy(capt->substream->runtime->dma_area + capt->channel_buf_n * i + dst_off_ch, play->substream->runtime->dma_area + play->channel_buf_n * i + src_off_ch, size_p_ch); } } static void 
copy_play_buf(struct loopback_pcm *play, struct loopback_pcm *capt, unsigned int bytes) { struct snd_pcm_runtime *runtime = play->substream->runtime; char *src = runtime->dma_area; char *dst = capt->substream->runtime->dma_area; unsigned int src_off = play->buf_pos; unsigned int dst_off = capt->buf_pos; unsigned int clear_bytes = 0; /* check if playback is draining, trim the capture copy size * when our pointer is at the end of playback ring buffer */ if (runtime->state == SNDRV_PCM_STATE_DRAINING && snd_pcm_playback_hw_avail(runtime) < runtime->buffer_size) { snd_pcm_uframes_t appl_ptr, appl_ptr1, diff; appl_ptr = appl_ptr1 = runtime->control->appl_ptr; appl_ptr1 -= appl_ptr1 % runtime->buffer_size; appl_ptr1 += play->buf_pos / play->pcm_salign; if (appl_ptr < appl_ptr1) appl_ptr1 -= runtime->buffer_size; diff = (appl_ptr - appl_ptr1) * play->pcm_salign; if (diff < bytes) { clear_bytes = bytes - diff; bytes = diff; } } for (;;) { unsigned int size = bytes; if (src_off + size > play->pcm_buffer_size) size = play->pcm_buffer_size - src_off; if (dst_off + size > capt->pcm_buffer_size) size = capt->pcm_buffer_size - dst_off; if (!is_access_interleaved(runtime->access)) copy_play_buf_part_n(play, capt, size, src_off, dst_off); else memcpy(dst + dst_off, src + src_off, size); capt->silent_size = 0; bytes -= size; if (!bytes) break; src_off = (src_off + size) % play->pcm_buffer_size; dst_off = (dst_off + size) % capt->pcm_buffer_size; } if (clear_bytes > 0) { clear_capture_buf(capt, clear_bytes); capt->silent_size = 0; } } static inline unsigned int bytepos_delta(struct loopback_pcm *dpcm, unsigned int jiffies_delta) { unsigned long last_pos; unsigned int delta; last_pos = byte_pos(dpcm, dpcm->irq_pos); dpcm->irq_pos += jiffies_delta * dpcm->pcm_bps; delta = byte_pos(dpcm, dpcm->irq_pos) - last_pos; if (delta >= dpcm->last_drift) delta -= dpcm->last_drift; dpcm->last_drift = 0; if (dpcm->irq_pos >= dpcm->period_size_frac) { dpcm->irq_pos %= dpcm->period_size_frac; dpcm->period_update_pending = 1; } return delta; } static inline void bytepos_finish(struct loopback_pcm *dpcm, unsigned int delta) { dpcm->buf_pos += delta; dpcm->buf_pos %= dpcm->pcm_buffer_size; } /* call in cable->lock */ static unsigned int loopback_jiffies_timer_pos_update (struct loopback_cable *cable) { struct loopback_pcm *dpcm_play = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]; struct loopback_pcm *dpcm_capt = cable->streams[SNDRV_PCM_STREAM_CAPTURE]; unsigned long delta_play = 0, delta_capt = 0, cur_jiffies; unsigned int running, count1, count2; cur_jiffies = jiffies; running = cable->running ^ cable->pause; if (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) { delta_play = cur_jiffies - dpcm_play->last_jiffies; dpcm_play->last_jiffies += delta_play; } if (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) { delta_capt = cur_jiffies - dpcm_capt->last_jiffies; dpcm_capt->last_jiffies += delta_capt; } if (delta_play == 0 && delta_capt == 0) goto unlock; if (delta_play > delta_capt) { count1 = bytepos_delta(dpcm_play, delta_play - delta_capt); bytepos_finish(dpcm_play, count1); delta_play = delta_capt; } else if (delta_play < delta_capt) { count1 = bytepos_delta(dpcm_capt, delta_capt - delta_play); clear_capture_buf(dpcm_capt, count1); bytepos_finish(dpcm_capt, count1); delta_capt = delta_play; } if (delta_play == 0 && delta_capt == 0) goto unlock; /* note delta_capt == delta_play at this moment */ count1 = bytepos_delta(dpcm_play, delta_play); count2 = bytepos_delta(dpcm_capt, delta_capt); if (count1 < count2) { dpcm_capt->last_drift = 
count2 - count1; count1 = count2; } else if (count1 > count2) { dpcm_play->last_drift = count1 - count2; } copy_play_buf(dpcm_play, dpcm_capt, count1); bytepos_finish(dpcm_play, count1); bytepos_finish(dpcm_capt, count1); unlock: return running; } static void loopback_jiffies_timer_function(struct timer_list *t) { struct loopback_pcm *dpcm = from_timer(dpcm, t, timer); unsigned long flags; spin_lock_irqsave(&dpcm->cable->lock, flags); if (loopback_jiffies_timer_pos_update(dpcm->cable) & (1 << dpcm->substream->stream)) { loopback_jiffies_timer_start(dpcm); if (dpcm->period_update_pending) { dpcm->period_update_pending = 0; spin_unlock_irqrestore(&dpcm->cable->lock, flags); /* need to unlock before calling below */ snd_pcm_period_elapsed(dpcm->substream); return; } } spin_unlock_irqrestore(&dpcm->cable->lock, flags); } /* call in cable->lock */ static int loopback_snd_timer_check_resolution(struct snd_pcm_runtime *runtime, unsigned long resolution) { if (resolution != runtime->timer_resolution) { struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; /* Worst case estimation of possible values for resolution * resolution <= (512 * 1024) frames / 8kHz in nsec * resolution <= 65.536.000.000 nsec * * period_size <= 65.536.000.000 nsec / 1000nsec/usec * 192kHz + * 500.000 * period_size <= 12.582.912.000.000 <64bit * / 1.000.000 usec/sec */ snd_pcm_uframes_t period_size_usec = resolution / 1000 * runtime->rate; /* round to nearest sample rate */ snd_pcm_uframes_t period_size = (period_size_usec + 500 * 1000) / (1000 * 1000); pcm_err(dpcm->substream->pcm, "Period size (%lu frames) of loopback device is not corresponding to timer resolution (%lu nsec = %lu frames) of card timer %d,%d,%d. Use period size of %lu frames for loopback device.", runtime->period_size, resolution, period_size, cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, period_size); return -EINVAL; } return 0; } static void loopback_snd_timer_period_elapsed(struct loopback_cable *cable, int event, unsigned long resolution) { struct loopback_pcm *dpcm_play, *dpcm_capt; struct snd_pcm_substream *substream_play, *substream_capt; struct snd_pcm_runtime *valid_runtime; unsigned int running, elapsed_bytes; unsigned long flags; spin_lock_irqsave(&cable->lock, flags); running = cable->running ^ cable->pause; /* no need to do anything if no stream is running */ if (!running) { spin_unlock_irqrestore(&cable->lock, flags); return; } dpcm_play = cable->streams[SNDRV_PCM_STREAM_PLAYBACK]; dpcm_capt = cable->streams[SNDRV_PCM_STREAM_CAPTURE]; if (event == SNDRV_TIMER_EVENT_MSTOP) { if (!dpcm_play || dpcm_play->substream->runtime->state != SNDRV_PCM_STATE_DRAINING) { spin_unlock_irqrestore(&cable->lock, flags); return; } } substream_play = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? dpcm_play->substream : NULL; substream_capt = (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) ? dpcm_capt->substream : NULL; valid_runtime = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? dpcm_play->substream->runtime : dpcm_capt->substream->runtime; /* resolution is only valid for SNDRV_TIMER_EVENT_TICK events */ if (event == SNDRV_TIMER_EVENT_TICK) { /* The hardware rules guarantee that playback and capture period * are the same. Therefore only one device has to be checked * here. 
*/ if (loopback_snd_timer_check_resolution(valid_runtime, resolution) < 0) { spin_unlock_irqrestore(&cable->lock, flags); if (substream_play) snd_pcm_stop_xrun(substream_play); if (substream_capt) snd_pcm_stop_xrun(substream_capt); return; } } elapsed_bytes = frames_to_bytes(valid_runtime, valid_runtime->period_size); /* The same timer interrupt is used for playback and capture device */ if ((running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) && (running & (1 << SNDRV_PCM_STREAM_CAPTURE))) { copy_play_buf(dpcm_play, dpcm_capt, elapsed_bytes); bytepos_finish(dpcm_play, elapsed_bytes); bytepos_finish(dpcm_capt, elapsed_bytes); } else if (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) { bytepos_finish(dpcm_play, elapsed_bytes); } else if (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) { clear_capture_buf(dpcm_capt, elapsed_bytes); bytepos_finish(dpcm_capt, elapsed_bytes); } spin_unlock_irqrestore(&cable->lock, flags); if (substream_play) snd_pcm_period_elapsed(substream_play); if (substream_capt) snd_pcm_period_elapsed(substream_capt); } static void loopback_snd_timer_function(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) { struct loopback_cable *cable = timeri->callback_data; loopback_snd_timer_period_elapsed(cable, SNDRV_TIMER_EVENT_TICK, resolution); } static void loopback_snd_timer_work(struct work_struct *work) { struct loopback_cable *cable; cable = container_of(work, struct loopback_cable, snd_timer.event_work); loopback_snd_timer_period_elapsed(cable, SNDRV_TIMER_EVENT_MSTOP, 0); } static void loopback_snd_timer_event(struct snd_timer_instance *timeri, int event, struct timespec64 *tstamp, unsigned long resolution) { /* Do not lock cable->lock here because timer->lock is already hold. * There are other functions which first lock cable->lock and than * timer->lock e.g. * loopback_trigger() * spin_lock(&cable->lock) * loopback_snd_timer_start() * snd_timer_start() * spin_lock(&timer->lock) * Therefore when using the oposit order of locks here it could result * in a deadlock. */ if (event == SNDRV_TIMER_EVENT_MSTOP) { struct loopback_cable *cable = timeri->callback_data; /* sound card of the timer was stopped. Therefore there will not * be any further timer callbacks. Due to this forward audio * data from here if in draining state. When still in running * state the streaming will be aborted by the usual timeout. It * should not be aborted here because may be the timer sound * card does only a recovery and the timer is back soon. * This work triggers loopback_snd_timer_work() */ schedule_work(&cable->snd_timer.event_work); } } static void loopback_jiffies_timer_dpcm_info(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer) { snd_iprintf(buffer, " update_pending:\t%u\n", dpcm->period_update_pending); snd_iprintf(buffer, " irq_pos:\t\t%u\n", dpcm->irq_pos); snd_iprintf(buffer, " period_frac:\t%u\n", dpcm->period_size_frac); snd_iprintf(buffer, " last_jiffies:\t%lu (%lu)\n", dpcm->last_jiffies, jiffies); snd_iprintf(buffer, " timer_expires:\t%lu\n", dpcm->timer.expires); } static void loopback_snd_timer_dpcm_info(struct loopback_pcm *dpcm, struct snd_info_buffer *buffer) { struct loopback_cable *cable = dpcm->cable; snd_iprintf(buffer, " sound timer:\thw:%d,%d,%d\n", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice); snd_iprintf(buffer, " timer open:\t\t%s\n", (cable->snd_timer.stream == SNDRV_PCM_STREAM_CAPTURE) ? 
"capture" : "playback"); } static snd_pcm_uframes_t loopback_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; snd_pcm_uframes_t pos; spin_lock(&dpcm->cable->lock); if (dpcm->cable->ops->pos_update) dpcm->cable->ops->pos_update(dpcm->cable); pos = dpcm->buf_pos; spin_unlock(&dpcm->cable->lock); return bytes_to_frames(runtime, pos); } static const struct snd_pcm_hardware loopback_pcm_hardware = { .info = (SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_NONINTERLEAVED), .formats = (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S24_3BE | SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_S32_BE | SNDRV_PCM_FMTBIT_FLOAT_LE | SNDRV_PCM_FMTBIT_FLOAT_BE | SNDRV_PCM_FMTBIT_DSD_U8 | SNDRV_PCM_FMTBIT_DSD_U16_LE | SNDRV_PCM_FMTBIT_DSD_U16_BE | SNDRV_PCM_FMTBIT_DSD_U32_LE | SNDRV_PCM_FMTBIT_DSD_U32_BE), .rates = SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_768000, .rate_min = 8000, .rate_max = 768000, .channels_min = 1, .channels_max = 32, .buffer_bytes_max = 2 * 1024 * 1024, .period_bytes_min = 64, /* note check overflow in frac_pos() using pcm_rate_shift before changing period_bytes_max value */ .period_bytes_max = 1024 * 1024, .periods_min = 1, .periods_max = 1024, .fifo_size = 0, }; static void loopback_runtime_free(struct snd_pcm_runtime *runtime) { struct loopback_pcm *dpcm = runtime->private_data; kfree(dpcm); } static int loopback_hw_free(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback_pcm *dpcm = runtime->private_data; struct loopback_cable *cable = dpcm->cable; mutex_lock(&dpcm->loopback->cable_lock); cable->valid &= ~(1 << substream->stream); mutex_unlock(&dpcm->loopback->cable_lock); return 0; } static unsigned int get_cable_index(struct snd_pcm_substream *substream) { if (!substream->pcm->device) return substream->stream; else return !substream->stream; } static int rule_format(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_mask m; snd_mask_none(&m); mutex_lock(&dpcm->loopback->cable_lock); m.bits[0] = (u_int32_t)cable->hw.formats; m.bits[1] = (u_int32_t)(cable->hw.formats >> 32); mutex_unlock(&dpcm->loopback->cable_lock); return snd_mask_refine(hw_param_mask(params, rule->var), &m); } static int rule_rate(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.rate_min; t.max = cable->hw.rate_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int rule_channels(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.channels_min; t.max = cable->hw.channels_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int rule_period_bytes(struct 
snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct loopback_pcm *dpcm = rule->private; struct loopback_cable *cable = dpcm->cable; struct snd_interval t; mutex_lock(&dpcm->loopback->cable_lock); t.min = cable->hw.period_bytes_min; t.max = cable->hw.period_bytes_max; mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = 0; t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static void free_cable(struct snd_pcm_substream *substream) { struct loopback *loopback = substream->private_data; int dev = get_cable_index(substream); struct loopback_cable *cable; cable = loopback->cables[substream->number][dev]; if (!cable) return; if (cable->streams[!substream->stream]) { /* other stream is still alive */ spin_lock_irq(&cable->lock); cable->streams[substream->stream] = NULL; spin_unlock_irq(&cable->lock); } else { struct loopback_pcm *dpcm = substream->runtime->private_data; if (cable->ops && cable->ops->close_cable && dpcm) cable->ops->close_cable(dpcm); /* free the cable */ loopback->cables[substream->number][dev] = NULL; kfree(cable); } } static int loopback_jiffies_timer_open(struct loopback_pcm *dpcm) { timer_setup(&dpcm->timer, loopback_jiffies_timer_function, 0); return 0; } static const struct loopback_ops loopback_jiffies_timer_ops = { .open = loopback_jiffies_timer_open, .start = loopback_jiffies_timer_start, .stop = loopback_jiffies_timer_stop, .stop_sync = loopback_jiffies_timer_stop_sync, .close_substream = loopback_jiffies_timer_stop_sync, .pos_update = loopback_jiffies_timer_pos_update, .dpcm_info = loopback_jiffies_timer_dpcm_info, }; static int loopback_parse_timer_id(const char *str, struct snd_timer_id *tid) { /* [<pref>:](<card name>|<card idx>)[{.,}<dev idx>[{.,}<subdev idx>]] */ const char * const sep_dev = ".,"; const char * const sep_pref = ":"; const char *name = str; char *sep, save = '\0'; int card_idx = 0, dev = 0, subdev = 0; int err; sep = strpbrk(str, sep_pref); if (sep) name = sep + 1; sep = strpbrk(name, sep_dev); if (sep) { save = *sep; *sep = '\0'; } err = kstrtoint(name, 0, &card_idx); if (err == -EINVAL) { /* Must be the name, not number */ for (card_idx = 0; card_idx < snd_ecards_limit; card_idx++) { struct snd_card *card = snd_card_ref(card_idx); if (card) { if (!strcmp(card->id, name)) err = 0; snd_card_unref(card); } if (!err) break; } } if (sep) { *sep = save; if (!err) { char *sep2, save2 = '\0'; sep2 = strpbrk(sep + 1, sep_dev); if (sep2) { save2 = *sep2; *sep2 = '\0'; } err = kstrtoint(sep + 1, 0, &dev); if (sep2) { *sep2 = save2; if (!err) err = kstrtoint(sep2 + 1, 0, &subdev); } } } if (!err && tid) { tid->card = card_idx; tid->device = dev; tid->subdevice = subdev; } return err; } /* call in loopback->cable_lock */ static int loopback_snd_timer_open(struct loopback_pcm *dpcm) { int err = 0; struct snd_timer_id tid = { .dev_class = SNDRV_TIMER_CLASS_PCM, .dev_sclass = SNDRV_TIMER_SCLASS_APPLICATION, }; struct snd_timer_instance *timeri; struct loopback_cable *cable = dpcm->cable; /* check if timer was already opened. It is only opened once * per playback and capture subdevice (aka cable). 
*/ if (cable->snd_timer.instance) goto exit; err = loopback_parse_timer_id(dpcm->loopback->timer_source, &tid); if (err < 0) { pcm_err(dpcm->substream->pcm, "Parsing timer source \'%s\' failed with %d", dpcm->loopback->timer_source, err); goto exit; } cable->snd_timer.stream = dpcm->substream->stream; cable->snd_timer.id = tid; timeri = snd_timer_instance_new(dpcm->loopback->card->id); if (!timeri) { err = -ENOMEM; goto exit; } /* The callback has to be called from another work. If * SNDRV_TIMER_IFLG_FAST is specified it will be called from the * snd_pcm_period_elapsed() call of the selected sound card. * snd_pcm_period_elapsed() helds snd_pcm_stream_lock_irqsave(). * Due to our callback loopback_snd_timer_function() also calls * snd_pcm_period_elapsed() which calls snd_pcm_stream_lock_irqsave(). * This would end up in a dead lock. */ timeri->flags |= SNDRV_TIMER_IFLG_AUTO; timeri->callback = loopback_snd_timer_function; timeri->callback_data = (void *)cable; timeri->ccallback = loopback_snd_timer_event; /* initialise a work used for draining */ INIT_WORK(&cable->snd_timer.event_work, loopback_snd_timer_work); /* The mutex loopback->cable_lock is kept locked. * Therefore snd_timer_open() cannot be called a second time * by the other device of the same cable. * Therefore the following issue cannot happen: * [proc1] Call loopback_timer_open() -> * Unlock cable->lock for snd_timer_close/open() call * [proc2] Call loopback_timer_open() -> snd_timer_open(), * snd_timer_start() * [proc1] Call snd_timer_open() and overwrite running timer * instance */ err = snd_timer_open(timeri, &cable->snd_timer.id, current->pid); if (err < 0) { pcm_err(dpcm->substream->pcm, "snd_timer_open (%d,%d,%d) failed with %d", cable->snd_timer.id.card, cable->snd_timer.id.device, cable->snd_timer.id.subdevice, err); snd_timer_instance_free(timeri); goto exit; } cable->snd_timer.instance = timeri; exit: return err; } /* stop_sync() is not required for sound timer because it does not need to be * restarted in loopback_prepare() on Xrun recovery */ static const struct loopback_ops loopback_snd_timer_ops = { .open = loopback_snd_timer_open, .start = loopback_snd_timer_start, .stop = loopback_snd_timer_stop, .close_cable = loopback_snd_timer_close_cable, .dpcm_info = loopback_snd_timer_dpcm_info, }; static int loopback_open(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm; struct loopback_cable *cable = NULL; int err = 0; int dev = get_cable_index(substream); mutex_lock(&loopback->cable_lock); dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) { err = -ENOMEM; goto unlock; } dpcm->loopback = loopback; dpcm->substream = substream; cable = loopback->cables[substream->number][dev]; if (!cable) { cable = kzalloc(sizeof(*cable), GFP_KERNEL); if (!cable) { err = -ENOMEM; goto unlock; } spin_lock_init(&cable->lock); cable->hw = loopback_pcm_hardware; if (loopback->timer_source) cable->ops = &loopback_snd_timer_ops; else cable->ops = &loopback_jiffies_timer_ops; loopback->cables[substream->number][dev] = cable; } dpcm->cable = cable; runtime->private_data = dpcm; if (cable->ops->open) { err = cable->ops->open(dpcm); if (err < 0) goto unlock; } snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS); /* use dynamic rules based on actual runtime->hw values */ /* note that the default rules created in the PCM midlevel code */ /* are cached -> they do not reflect the actual state */ err = 
snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT, rule_format, dpcm, SNDRV_PCM_HW_PARAM_FORMAT, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, rule_rate, dpcm, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, rule_channels, dpcm, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) goto unlock; /* In case of sound timer the period time of both devices of the same * loop has to be the same. * This rule only takes effect if a sound timer was chosen */ if (cable->snd_timer.instance) { err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, rule_period_bytes, dpcm, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, -1); if (err < 0) goto unlock; } /* loopback_runtime_free() has not to be called if kfree(dpcm) was * already called here. Otherwise it will end up with a double free. */ runtime->private_free = loopback_runtime_free; if (get_notify(dpcm)) runtime->hw = loopback_pcm_hardware; else runtime->hw = cable->hw; spin_lock_irq(&cable->lock); cable->streams[substream->stream] = dpcm; spin_unlock_irq(&cable->lock); unlock: if (err < 0) { free_cable(substream); kfree(dpcm); } mutex_unlock(&loopback->cable_lock); return err; } static int loopback_close(struct snd_pcm_substream *substream) { struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm = substream->runtime->private_data; int err = 0; if (dpcm->cable->ops->close_substream) err = dpcm->cable->ops->close_substream(dpcm); mutex_lock(&loopback->cable_lock); free_cable(substream); mutex_unlock(&loopback->cable_lock); return err; } static const struct snd_pcm_ops loopback_pcm_ops = { .open = loopback_open, .close = loopback_close, .hw_free = loopback_hw_free, .prepare = loopback_prepare, .trigger = loopback_trigger, .pointer = loopback_pointer, }; static int loopback_pcm_new(struct loopback *loopback, int device, int substreams) { struct snd_pcm *pcm; int err; err = snd_pcm_new(loopback->card, "Loopback PCM", device, substreams, substreams, &pcm); if (err < 0) return err; snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &loopback_pcm_ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &loopback_pcm_ops); snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_VMALLOC, NULL, 0, 0); pcm->private_data = loopback; pcm->info_flags = 0; strcpy(pcm->name, "Loopback PCM"); loopback->pcm[device] = pcm; return 0; } static int loopback_rate_shift_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 80000; uinfo->value.integer.max = 120000; uinfo->value.integer.step = 1; return 0; } static int loopback_rate_shift_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate_shift; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_rate_shift_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); unsigned int val; int change = 0; val = ucontrol->value.integer.value[0]; if (val < 80000) val = 80000; if (val > 120000) val = 120000; mutex_lock(&loopback->cable_lock); if (val != loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate_shift) { loopback->setup[kcontrol->id.subdevice] 
[kcontrol->id.device].rate_shift = val; change = 1; } mutex_unlock(&loopback->cable_lock); return change; } static int loopback_notify_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_notify_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); unsigned int val; int change = 0; val = ucontrol->value.integer.value[0] ? 1 : 0; mutex_lock(&loopback->cable_lock); if (val != loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify) { loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].notify = val; change = 1; } mutex_unlock(&loopback->cable_lock); return change; } static int loopback_active_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); struct loopback_cable *cable; unsigned int val = 0; mutex_lock(&loopback->cable_lock); cable = loopback->cables[kcontrol->id.subdevice][kcontrol->id.device ^ 1]; if (cable != NULL) { unsigned int running = cable->running ^ cable->pause; val = (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) ? 1 : 0; } mutex_unlock(&loopback->cable_lock); ucontrol->value.integer.value[0] = val; return 0; } static int loopback_format_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = (__force int)SNDRV_PCM_FORMAT_LAST; uinfo->value.integer.step = 1; return 0; } static int loopback_format_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); ucontrol->value.integer.value[0] = (__force int)loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].format; return 0; } static int loopback_rate_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 192000; uinfo->value.integer.step = 1; return 0; } static int loopback_rate_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].rate; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_channels_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 1; uinfo->value.integer.max = 1024; uinfo->value.integer.step = 1; return 0; } static int loopback_channels_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); mutex_lock(&loopback->cable_lock); ucontrol->value.integer.value[0] = loopback->setup[kcontrol->id.subdevice] [kcontrol->id.device].channels; mutex_unlock(&loopback->cable_lock); return 0; } static int loopback_access_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { const char * const texts[] = {"Interleaved", "Non-interleaved"}; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int 
loopback_access_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct loopback *loopback = snd_kcontrol_chip(kcontrol); snd_pcm_access_t access; mutex_lock(&loopback->cable_lock); access = loopback->setup[kcontrol->id.subdevice][kcontrol->id.device].access; ucontrol->value.enumerated.item[0] = !is_access_interleaved(access); mutex_unlock(&loopback->cable_lock); return 0; } static const struct snd_kcontrol_new loopback_controls[] = { { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Rate Shift 100000", .info = loopback_rate_shift_info, .get = loopback_rate_shift_get, .put = loopback_rate_shift_put, }, { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Notify", .info = snd_ctl_boolean_mono_info, .get = loopback_notify_get, .put = loopback_notify_put, }, #define ACTIVE_IDX 2 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Active", .info = snd_ctl_boolean_mono_info, .get = loopback_active_get, }, #define FORMAT_IDX 3 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Format", .info = loopback_format_info, .get = loopback_format_get }, #define RATE_IDX 4 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Rate", .info = loopback_rate_info, .get = loopback_rate_get }, #define CHANNELS_IDX 5 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Channels", .info = loopback_channels_info, .get = loopback_channels_get }, #define ACCESS_IDX 6 { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = "PCM Slave Access Mode", .info = loopback_access_info, .get = loopback_access_get, }, }; static int loopback_mixer_new(struct loopback *loopback, int notify) { struct snd_card *card = loopback->card; struct snd_pcm *pcm; struct snd_kcontrol *kctl; struct loopback_setup *setup; int err, dev, substr, substr_count, idx; strcpy(card->mixername, "Loopback Mixer"); for (dev = 0; dev < 2; dev++) { pcm = loopback->pcm[dev]; substr_count = pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream_count; for (substr = 0; substr < substr_count; substr++) { setup = &loopback->setup[substr][dev]; setup->notify = notify; setup->rate_shift = NO_PITCH; setup->format = SNDRV_PCM_FORMAT_S16_LE; setup->access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; setup->rate = 48000; setup->channels = 2; for (idx = 0; idx < ARRAY_SIZE(loopback_controls); idx++) { kctl = snd_ctl_new1(&loopback_controls[idx], loopback); if (!kctl) return -ENOMEM; kctl->id.device = dev; kctl->id.subdevice = substr; /* Add the control before copying the id so that * the numid field of the id is set in the copy. 
*/ err = snd_ctl_add(card, kctl); if (err < 0) return err; switch (idx) { case ACTIVE_IDX: setup->active_id = kctl->id; break; case FORMAT_IDX: setup->format_id = kctl->id; break; case RATE_IDX: setup->rate_id = kctl->id; break; case CHANNELS_IDX: setup->channels_id = kctl->id; break; case ACCESS_IDX: setup->access_id = kctl->id; break; default: break; } } } } return 0; } static void print_dpcm_info(struct snd_info_buffer *buffer, struct loopback_pcm *dpcm, const char *id) { snd_iprintf(buffer, " %s\n", id); if (dpcm == NULL) { snd_iprintf(buffer, " inactive\n"); return; } snd_iprintf(buffer, " buffer_size:\t%u\n", dpcm->pcm_buffer_size); snd_iprintf(buffer, " buffer_pos:\t\t%u\n", dpcm->buf_pos); snd_iprintf(buffer, " silent_size:\t%u\n", dpcm->silent_size); snd_iprintf(buffer, " period_size:\t%u\n", dpcm->pcm_period_size); snd_iprintf(buffer, " bytes_per_sec:\t%u\n", dpcm->pcm_bps); snd_iprintf(buffer, " sample_align:\t%u\n", dpcm->pcm_salign); snd_iprintf(buffer, " rate_shift:\t\t%u\n", dpcm->pcm_rate_shift); if (dpcm->cable->ops->dpcm_info) dpcm->cable->ops->dpcm_info(dpcm, buffer); } static void print_substream_info(struct snd_info_buffer *buffer, struct loopback *loopback, int sub, int num) { struct loopback_cable *cable = loopback->cables[sub][num]; snd_iprintf(buffer, "Cable %i substream %i:\n", num, sub); if (cable == NULL) { snd_iprintf(buffer, " inactive\n"); return; } snd_iprintf(buffer, " valid: %u\n", cable->valid); snd_iprintf(buffer, " running: %u\n", cable->running); snd_iprintf(buffer, " pause: %u\n", cable->pause); print_dpcm_info(buffer, cable->streams[0], "Playback"); print_dpcm_info(buffer, cable->streams[1], "Capture"); } static void print_cable_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; int sub, num; mutex_lock(&loopback->cable_lock); num = entry->name[strlen(entry->name)-1]; num = num == '0' ? 0 : 1; for (sub = 0; sub < MAX_PCM_SUBSTREAMS; sub++) print_substream_info(buffer, loopback, sub, num); mutex_unlock(&loopback->cable_lock); } static int loopback_cable_proc_new(struct loopback *loopback, int cidx) { char name[32]; snprintf(name, sizeof(name), "cable#%d", cidx); return snd_card_ro_proc_new(loopback->card, name, loopback, print_cable_info); } static void loopback_set_timer_source(struct loopback *loopback, const char *value) { if (loopback->timer_source) { devm_kfree(loopback->card->dev, loopback->timer_source); loopback->timer_source = NULL; } if (value && *value) loopback->timer_source = devm_kstrdup(loopback->card->dev, value, GFP_KERNEL); } static void print_timer_source_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; mutex_lock(&loopback->cable_lock); snd_iprintf(buffer, "%s\n", loopback->timer_source ? 
loopback->timer_source : ""); mutex_unlock(&loopback->cable_lock); } static void change_timer_source_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct loopback *loopback = entry->private_data; char line[64]; mutex_lock(&loopback->cable_lock); if (!snd_info_get_line(buffer, line, sizeof(line))) loopback_set_timer_source(loopback, strim(line)); mutex_unlock(&loopback->cable_lock); } static int loopback_timer_source_proc_new(struct loopback *loopback) { return snd_card_rw_proc_new(loopback->card, "timer_source", loopback, print_timer_source_info, change_timer_source_info); } static int loopback_probe(struct platform_device *devptr) { struct snd_card *card; struct loopback *loopback; int dev = devptr->id; int err; err = snd_devm_card_new(&devptr->dev, index[dev], id[dev], THIS_MODULE, sizeof(struct loopback), &card); if (err < 0) return err; loopback = card->private_data; if (pcm_substreams[dev] < 1) pcm_substreams[dev] = 1; if (pcm_substreams[dev] > MAX_PCM_SUBSTREAMS) pcm_substreams[dev] = MAX_PCM_SUBSTREAMS; loopback->card = card; loopback_set_timer_source(loopback, timer_source[dev]); mutex_init(&loopback->cable_lock); err = loopback_pcm_new(loopback, 0, pcm_substreams[dev]); if (err < 0) return err; err = loopback_pcm_new(loopback, 1, pcm_substreams[dev]); if (err < 0) return err; err = loopback_mixer_new(loopback, pcm_notify[dev] ? 1 : 0); if (err < 0) return err; loopback_cable_proc_new(loopback, 0); loopback_cable_proc_new(loopback, 1); loopback_timer_source_proc_new(loopback); strcpy(card->driver, "Loopback"); strcpy(card->shortname, "Loopback"); sprintf(card->longname, "Loopback %i", dev + 1); err = snd_card_register(card); if (err < 0) return err; platform_set_drvdata(devptr, card); return 0; } static int loopback_suspend(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D3hot); return 0; } static int loopback_resume(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D0); return 0; } static DEFINE_SIMPLE_DEV_PM_OPS(loopback_pm, loopback_suspend, loopback_resume); #define SND_LOOPBACK_DRIVER "snd_aloop" static struct platform_driver loopback_driver = { .probe = loopback_probe, .driver = { .name = SND_LOOPBACK_DRIVER, .pm = &loopback_pm, }, }; static void loopback_unregister_all(void) { int i; for (i = 0; i < ARRAY_SIZE(devices); ++i) platform_device_unregister(devices[i]); platform_driver_unregister(&loopback_driver); } static int __init alsa_card_loopback_init(void) { int i, err, cards; err = platform_driver_register(&loopback_driver); if (err < 0) return err; cards = 0; for (i = 0; i < SNDRV_CARDS; i++) { struct platform_device *device; if (!enable[i]) continue; device = platform_device_register_simple(SND_LOOPBACK_DRIVER, i, NULL, 0); if (IS_ERR(device)) continue; if (!platform_get_drvdata(device)) { platform_device_unregister(device); continue; } devices[i] = device; cards++; } if (!cards) { #ifdef MODULE printk(KERN_ERR "aloop: No loopback enabled\n"); #endif loopback_unregister_all(); return -ENODEV; } return 0; } static void __exit alsa_card_loopback_exit(void) { loopback_unregister_all(); } module_init(alsa_card_loopback_init) module_exit(alsa_card_loopback_exit)
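End of the loopback (snd-aloop) PCM driver listing above. As a reading aid for its cable bookkeeping: get_cable_index() maps a (device, stream) pair onto a cable slot so that playback on one PCM device and capture on the other land on the same cable. The standalone sketch below only mirrors that mapping; the helper name cable_index and the demo program are illustrative, and it assumes the usual ALSA numbering where playback is stream 0 and capture is stream 1.

#include <stdio.h>

/* Mirror of get_cable_index() above: device 0 uses the stream number
 * directly, device 1 inverts it, so playback on one device and capture
 * on the other share a cable slot (assumed numbering: playback = 0,
 * capture = 1).
 */
static int cable_index(int device, int stream)
{
        return device == 0 ? stream : !stream;
}

int main(void)
{
        const char *stream_name[] = { "playback", "capture" };
        int device, stream;

        for (device = 0; device < 2; device++)
                for (stream = 0; stream < 2; stream++)
                        printf("device %d %s -> cable %d\n",
                               device, stream_name[stream],
                               cable_index(device, stream));

        /* The output shows playback on device 0 and capture on device 1
         * both mapping to cable 0: audio written to one device is read
         * back from the other on the same substream number.
         */
        return 0;
}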
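The jiffies-timer backend above has no real hardware pointer; bytepos_delta() and bytepos_finish() advance the stream position from elapsed jiffies and set period_update_pending once period_size_frac is crossed. Below is a minimal userspace model of that accounting. It assumes positions are kept scaled by the tick rate and converted back to bytes by division, it ignores the rate-shift control, and the names pos_model/advance and the HZ value are invented for the model; this is a sketch of the idea, not the driver's exact arithmetic.

#include <stdio.h>

#define HZ 250                  /* assumed tick rate for this model only */

struct pos_model {
        unsigned long long irq_pos;          /* bytes scaled by HZ */
        unsigned long long period_size_frac; /* period bytes scaled by HZ */
        unsigned int buf_pos;                /* byte position in the ring */
        unsigned int buffer_bytes;
        int period_elapsed;
};

/* Advance the model by a number of elapsed ticks, mirroring the idea of
 * bytepos_delta()/bytepos_finish(): accumulate ticks * bytes_per_second,
 * convert back to whole bytes, wrap the ring-buffer position, and flag a
 * period boundary when the scaled counter crosses period_size_frac.
 */
static unsigned int advance(struct pos_model *m, unsigned int ticks,
                            unsigned int bytes_per_sec)
{
        unsigned int last = m->irq_pos / HZ;
        unsigned int delta;

        m->irq_pos += (unsigned long long)ticks * bytes_per_sec;
        delta = m->irq_pos / HZ - last;
        if (m->irq_pos >= m->period_size_frac) {
                m->irq_pos %= m->period_size_frac;
                m->period_elapsed = 1;
        }
        m->buf_pos = (m->buf_pos + delta) % m->buffer_bytes;
        return delta;
}

int main(void)
{
        /* 48 kHz, 16-bit stereo: 192000 bytes/s; 1024-frame period = 4096 bytes */
        struct pos_model m = {
                .period_size_frac = 4096ULL * HZ,
                .buffer_bytes = 4 * 4096,
        };
        unsigned int t;

        for (t = 0; t < 8; t++) {
                unsigned int d = advance(&m, 1, 192000);

                printf("tick %u: +%u bytes, buf_pos=%u, period=%d\n",
                       t, d, m.buf_pos, m.period_elapsed);
                m.period_elapsed = 0;
        }
        return 0;
}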
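loopback_parse_timer_id() above splits the timer_source parameter of the form [<pref>:](<card name>|<card idx>)[{.,}<dev idx>[{.,}<subdev idx>]]. The rough userspace equivalent below handles only numeric card indices and skips the card-name lookup; parse_timer_source and the fixed buffer size are illustrative choices, not part of the driver.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative only: split a timer source spec such as "hw:1.0.0" or "2,1"
 * into card/device/subdevice, mimicking the format handled by
 * loopback_parse_timer_id() above. Card names are not resolved here.
 */
static int parse_timer_source(const char *str, int *card, int *dev, int *subdev)
{
        char buf[64];
        char *name, *sep;

        *card = *dev = *subdev = 0;
        if (strlen(str) >= sizeof(buf))
                return -1;
        strcpy(buf, str);

        name = buf;
        sep = strchr(name, ':');            /* optional "<pref>:" prefix */
        if (sep)
                name = sep + 1;

        sep = strpbrk(name, ".,");          /* card [. dev [. subdev]] */
        if (sep)
                *sep++ = '\0';
        *card = atoi(name);
        if (sep) {
                name = sep;
                sep = strpbrk(name, ".,");
                if (sep)
                        *sep++ = '\0';
                *dev = atoi(name);
                if (sep)
                        *subdev = atoi(sep);
        }
        return 0;
}

int main(void)
{
        int card, dev, subdev;

        if (parse_timer_source("hw:1.0.0", &card, &dev, &subdev) == 0)
                printf("card %d, device %d, subdevice %d\n", card, dev, subdev);
        return 0;
}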
/* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_FRAMEBUFFER_H__ #define __DRM_FRAMEBUFFER_H__ #include <linux/ctype.h> #include <linux/list.h> #include <linux/sched.h> #include <drm/drm_fourcc.h> #include <drm/drm_mode_object.h> struct drm_clip_rect; struct drm_device; struct drm_file; struct drm_framebuffer; struct drm_gem_object; /** * struct drm_framebuffer_funcs - framebuffer hooks */ struct drm_framebuffer_funcs { /** * @destroy: * * Clean up framebuffer resources, specifically also unreference the * backing storage. The core guarantees to call this function for every * framebuffer successfully created by calling * &drm_mode_config_funcs.fb_create. Drivers must also call * drm_framebuffer_cleanup() to release DRM core resources for this * framebuffer. */ void (*destroy)(struct drm_framebuffer *framebuffer); /** * @create_handle: * * Create a buffer handle in the driver-specific buffer manager (either * GEM or TTM) valid for the passed-in &struct drm_file. This is used by * the core to implement the GETFB IOCTL, which returns (for * sufficiently privileged user) also a native buffer handle.
This can * be used for seamless transitions between modesetting clients by * copying the current screen contents to a private buffer and blending * between that and the new contents. * * GEM based drivers should call drm_gem_handle_create() to create the * handle. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*create_handle)(struct drm_framebuffer *fb, struct drm_file *file_priv, unsigned int *handle); /** * @dirty: * * Optional callback for the dirty fb IOCTL. * * Userspace can notify the driver via this callback that an area of the * framebuffer has changed and should be flushed to the display * hardware. This can also be used internally, e.g. by the fbdev * emulation, though that's not the case currently. * * See documentation in drm_mode.h for the struct drm_mode_fb_dirty_cmd * for more information as all the semantics and arguments have a one to * one mapping on this function. * * Atomic drivers should use drm_atomic_helper_dirtyfb() to implement * this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*dirty)(struct drm_framebuffer *framebuffer, struct drm_file *file_priv, unsigned flags, unsigned color, struct drm_clip_rect *clips, unsigned num_clips); }; /** * struct drm_framebuffer - frame buffer object * * Note that the fb is refcounted for the benefit of driver internals, * for example some hw, disabling a CRTC/plane is asynchronous, and * scanout does not actually complete until the next vblank. So some * cleanup (like releasing the reference(s) on the backing GEM bo(s)) * should be deferred. In cases like this, the driver would like to * hold a ref to the fb even though it has already been removed from * userspace perspective. See drm_framebuffer_get() and * drm_framebuffer_put(). * * The refcount is stored inside the mode object @base. */ struct drm_framebuffer { /** * @dev: DRM device this framebuffer belongs to */ struct drm_device *dev; /** * @head: Place on the &drm_mode_config.fb_list, access protected by * &drm_mode_config.fb_lock. */ struct list_head head; /** * @base: base modeset object structure, contains the reference count. */ struct drm_mode_object base; /** * @comm: Name of the process allocating the fb, used for fb dumping. */ char comm[TASK_COMM_LEN]; /** * @format: framebuffer format information */ const struct drm_format_info *format; /** * @funcs: framebuffer vfunc table */ const struct drm_framebuffer_funcs *funcs; /** * @pitches: Line stride per buffer. For userspace created object this * is copied from drm_mode_fb_cmd2. */ unsigned int pitches[DRM_FORMAT_MAX_PLANES]; /** * @offsets: Offset from buffer start to the actual pixel data in bytes, * per buffer. For userspace created object this is copied from * drm_mode_fb_cmd2. * * Note that this is a linear offset and does not take into account * tiling or buffer layout per @modifier. It is meant to be used when * the actual pixel data for this framebuffer plane starts at an offset, * e.g. when multiple planes are allocated within the same backing * storage buffer object. For tiled layouts this generally means its * @offsets must at least be tile-size aligned, but hardware often has * stricter requirements. * * This should not be used to specifiy x/y pixel offsets into the buffer * data (even for linear buffers). Specifying an x/y pixel offset is * instead done through the source rectangle in &struct drm_plane_state. */ unsigned int offsets[DRM_FORMAT_MAX_PLANES]; /** * @modifier: Data layout modifier. 
This is used to describe * tiling, or also special layouts (like compression) of auxiliary * buffers. For userspace created object this is copied from * drm_mode_fb_cmd2. */ uint64_t modifier; /** * @width: Logical width of the visible area of the framebuffer, in * pixels. */ unsigned int width; /** * @height: Logical height of the visible area of the framebuffer, in * pixels. */ unsigned int height; /** * @flags: Framebuffer flags like DRM_MODE_FB_INTERLACED or * DRM_MODE_FB_MODIFIERS. */ int flags; /** * @filp_head: Placed on &drm_file.fbs, protected by &drm_file.fbs_lock. */ struct list_head filp_head; /** * @obj: GEM objects backing the framebuffer, one per plane (optional). * * This is used by the GEM framebuffer helpers, see e.g. * drm_gem_fb_create(). */ struct drm_gem_object *obj[DRM_FORMAT_MAX_PLANES]; }; #define obj_to_fb(x) container_of(x, struct drm_framebuffer, base) int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_framebuffer_funcs *funcs); struct drm_framebuffer *drm_framebuffer_lookup(struct drm_device *dev, struct drm_file *file_priv, uint32_t id); void drm_framebuffer_remove(struct drm_framebuffer *fb); void drm_framebuffer_cleanup(struct drm_framebuffer *fb); void drm_framebuffer_unregister_private(struct drm_framebuffer *fb); /** * drm_framebuffer_get - acquire a framebuffer reference * @fb: DRM framebuffer * * This function increments the framebuffer's reference count. */ static inline void drm_framebuffer_get(struct drm_framebuffer *fb) { drm_mode_object_get(&fb->base); } /** * drm_framebuffer_put - release a framebuffer reference * @fb: DRM framebuffer * * This function decrements the framebuffer's reference count and frees the * framebuffer if the reference count drops to zero. */ static inline void drm_framebuffer_put(struct drm_framebuffer *fb) { drm_mode_object_put(&fb->base); } /** * drm_framebuffer_read_refcount - read the framebuffer reference count. * @fb: framebuffer * * This functions returns the framebuffer's reference count. */ static inline uint32_t drm_framebuffer_read_refcount(const struct drm_framebuffer *fb) { return kref_read(&fb->base.refcount); } /** * drm_framebuffer_assign - store a reference to the fb * @p: location to store framebuffer * @fb: new framebuffer (maybe NULL) * * This functions sets the location to store a reference to the framebuffer, * unreferencing the framebuffer that was previously stored in that location. */ static inline void drm_framebuffer_assign(struct drm_framebuffer **p, struct drm_framebuffer *fb) { if (fb) drm_framebuffer_get(fb); if (*p) drm_framebuffer_put(*p); *p = fb; } /* * drm_for_each_fb - iterate over all framebuffers * @fb: the loop cursor * @dev: the DRM device * * Iterate over all framebuffers of @dev. User must hold * &drm_mode_config.fb_lock. */ #define drm_for_each_fb(fb, dev) \ for (WARN_ON(!mutex_is_locked(&(dev)->mode_config.fb_lock)), \ fb = list_first_entry(&(dev)->mode_config.fb_list, \ struct drm_framebuffer, head); \ &fb->head != (&(dev)->mode_config.fb_list); \ fb = list_next_entry(fb, head)) /** * struct drm_afbc_framebuffer - a special afbc frame buffer object * * A derived class of struct drm_framebuffer, dedicated for afbc use cases. */ struct drm_afbc_framebuffer { /** * @base: base framebuffer structure. 
*/ struct drm_framebuffer base; /** * @block_width: width of a single afbc block */ u32 block_width; /** * @block_height: height of a single afbc block */ u32 block_height; /** * @aligned_width: aligned frame buffer width */ u32 aligned_width; /** * @aligned_height: aligned frame buffer height */ u32 aligned_height; /** * @offset: offset of the first afbc header */ u32 offset; /** * @afbc_size: minimum size of afbc buffer */ u32 afbc_size; }; #define fb_to_afbc_fb(x) container_of(x, struct drm_afbc_framebuffer, base) #endif
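For the @pitches and @offsets members documented above: with a linear (untiled) layout, the byte position of a pixel inside a plane follows directly from the per-plane offset, the line stride, and the bytes-per-pixel of the format. The sketch below assumes such a linear layout and a hypothetical single-plane, 4-byte-per-pixel format; fb_pixel_offset is an illustrative helper, not a DRM API.

#include <stdio.h>
#include <stdint.h>

/* Byte offset of pixel (x, y) in plane `p` of a *linear* framebuffer, using
 * the per-plane pitch and offset described for struct drm_framebuffer above.
 * Tiled or compressed layouts (non-zero modifier) need format-specific math
 * instead; cpp (bytes per pixel) is assumed to be known from the format.
 */
static uint64_t fb_pixel_offset(const uint32_t *pitches,
                                const uint32_t *offsets,
                                unsigned int p, unsigned int cpp,
                                unsigned int x, unsigned int y)
{
        return (uint64_t)offsets[p] + (uint64_t)y * pitches[p] + (uint64_t)x * cpp;
}

int main(void)
{
        /* single-plane 1920x1080 buffer with 4 bytes per pixel: pitch = width * 4 */
        uint32_t pitches[1] = { 1920 * 4 };
        uint32_t offsets[1] = { 0 };

        printf("pixel (10, 2) starts at byte %llu\n",
               (unsigned long long)fb_pixel_offset(pitches, offsets, 0, 4, 10, 2));
        return 0;
}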
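drm_framebuffer_assign() above acquires the reference on the new framebuffer before dropping the one previously stored, which keeps the helper safe even when the new and old pointers are the same object. The generic userspace model below (obj and assign are invented names) shows why that ordering matters.

#include <stdio.h>

struct obj {
        int refcount;
};

static void obj_get(struct obj *o) { o->refcount++; }

static void obj_put(struct obj *o)
{
        if (--o->refcount == 0)
                printf("object freed\n");
}

/* Same shape as drm_framebuffer_assign(): take the new reference first, then
 * release whatever the slot held. If the slot already holds the same object
 * and that was its only other reference, the count goes 1 -> 2 -> 1 instead
 * of dropping to 0 and freeing an object that is still in use.
 */
static void assign(struct obj **slot, struct obj *o)
{
        if (o)
                obj_get(o);
        if (*slot)
                obj_put(*slot);
        *slot = o;
}

int main(void)
{
        struct obj a = { .refcount = 1 };   /* caller's own reference */
        struct obj *slot = NULL;

        assign(&slot, &a);      /* slot takes a reference: refcount == 2 */
        assign(&slot, &a);      /* self-assignment stays at refcount == 2 */
        assign(&slot, NULL);    /* drop it again: refcount == 1 */
        printf("refcount now %d\n", a.refcount);
        return 0;
}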
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * * This file is part of the SCTP kernel implementation * * These are the state tables for the SCTP state machine. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state); static const struct sctp_sm_table_entry bug = { .fn = sctp_sf_bug, .name = "sctp_sf_bug" }; #define DO_LOOKUP(_max, _type, _table) \ ({ \ const struct sctp_sm_table_entry *rtn; \ \ if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ _table, event_subtype._type, _max); \ rtn = &bug; \ } else \ rtn = &_table[event_subtype._type][(int)state]; \ \ rtn; \ }) const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype) { switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(net, event_subtype.chunk, state); case SCTP_EVENT_T_TIMEOUT: return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, timeout_event_table); case SCTP_EVENT_T_OTHER: return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); case SCTP_EVENT_T_PRIMITIVE: return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, primitive_event_table); default: /* Yikes! We got an illegal event type.
*/ return &bug; } } #define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} #define TYPE_SCTP_DATA { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_DATA */ #define TYPE_SCTP_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \ } /* TYPE_SCTP_INIT */ #define TYPE_SCTP_INIT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_INIT_ACK */ #define TYPE_SCTP_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SACK */ #define TYPE_SCTP_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ /* This should not happen, but we are nice. 
*/ \ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ } /* TYPE_SCTP_HEARTBEAT */ #define TYPE_SCTP_HEARTBEAT_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_HEARTBEAT_ACK */ #define TYPE_SCTP_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_pdiscard), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \ } /* TYPE_SCTP_ABORT */ #define TYPE_SCTP_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SHUTDOWN */ #define TYPE_SCTP_SHUTDOWN_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_violation), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ } /* TYPE_SCTP_SHUTDOWN_ACK */ #define TYPE_SCTP_ERROR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ERROR */ #define TYPE_SCTP_COOKIE_ECHO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_COOKIE_ECHOED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ } /* TYPE_SCTP_COOKIE_ECHO */ #define TYPE_SCTP_COOKIE_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_COOKIE_ACK */ #define TYPE_SCTP_ECN_ECNE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_ECNE */ #define TYPE_SCTP_ECN_CWR { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ECN_CWR */ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_4_C), \ } /* TYPE_SCTP_SHUTDOWN_COMPLETE */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. * * For base protocol (RFC 2960). 
*/ static const struct sctp_sm_table_entry chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_DATA, TYPE_SCTP_INIT, TYPE_SCTP_INIT_ACK, TYPE_SCTP_SACK, TYPE_SCTP_HEARTBEAT, TYPE_SCTP_HEARTBEAT_ACK, TYPE_SCTP_ABORT, TYPE_SCTP_SHUTDOWN, TYPE_SCTP_SHUTDOWN_ACK, TYPE_SCTP_ERROR, TYPE_SCTP_COOKIE_ECHO, TYPE_SCTP_COOKIE_ACK, TYPE_SCTP_ECN_ECNE, TYPE_SCTP_ECN_CWR, TYPE_SCTP_SHUTDOWN_COMPLETE, }; /* state_fn_t chunk_event_table[][] */ #define TYPE_SCTP_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF */ #define TYPE_SCTP_ASCONF_ACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF_ACK */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_ASCONF, TYPE_SCTP_ASCONF_ACK, }; /*state_fn_t addip_chunk_event_table[][] */ #define TYPE_SCTP_FWD_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_FWD_TSN */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ #define TYPE_SCTP_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_RECONF */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ static const struct sctp_sm_table_entry reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_RECONF, }; /*state_fn_t reconf_chunk_event_table[][] */ #define TYPE_SCTP_AUTH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ } /* TYPE_SCTP_AUTH */ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ static const struct sctp_sm_table_entry pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), }; /* chunk pad */ static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), /* SCTP_STATE_COOKIE_WAIT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_COOKIE_ECHOED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_ESTABLISHED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_PENDING */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_RECEIVED */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), /* SCTP_STATE_SHUTDOWN_ACK_SENT */ TYPE_SCTP_FUNC(sctp_sf_unk_chunk), }; /* chunk unknown */ #define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_not_impl), \ } /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ #define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ } /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ #define TYPE_SCTP_PRIMITIVE_ABORT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \ } /* TYPE_SCTP_PRIMITIVE_ABORT 
*/ #define TYPE_SCTP_PRIMITIVE_SEND { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_SEND */ #define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ } /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ #define TYPE_SCTP_PRIMITIVE_ASCONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_ASCONF */ #define TYPE_SCTP_PRIMITIVE_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_error_closed), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_RECONF */ /* The primary index for this table is the primitive type. * The secondary index for this table is the state. 
*/ static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_PRIMITIVE_ASSOCIATE, TYPE_SCTP_PRIMITIVE_SHUTDOWN, TYPE_SCTP_PRIMITIVE_ABORT, TYPE_SCTP_PRIMITIVE_SEND, TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, TYPE_SCTP_PRIMITIVE_ASCONF, TYPE_SCTP_PRIMITIVE_RECONF, }; #define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } #define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_OTHER_NO_PENDING_TSN, TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, }; #define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_bug), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* 
SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ 
TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_reconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } #define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_WAIT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_COOKIE_ECHOED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_send_probe), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state) { if (state > SCTP_STATE_MAX) return &bug; if (cid == SCTP_CID_I_DATA) cid = SCTP_CID_DATA; if (cid <= SCTP_CID_BASE_MAX) return &chunk_event_table[cid][state]; switch ((u16)cid) { case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: return &prsctp_chunk_event_table[0][state]; case SCTP_CID_ASCONF: return &addip_chunk_event_table[0][state]; case SCTP_CID_ASCONF_ACK: return &addip_chunk_event_table[1][state]; case SCTP_CID_RECONF: return &reconf_chunk_event_table[0][state]; case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; case SCTP_CID_PAD: return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; }
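All of the chunk, primitive, other-event and timeout tables above follow one dispatch pattern: the event type selects a table (or row), the association state selects the column, and the selected entry carries both the state-function pointer and its stringified name (via TYPE_SCTP_FUNC). sctp_chunk_event_lookup() then reduces dispatch to a bounds check plus a two-dimensional array index, with the extension chunk types routed to their own smaller tables. The standalone sketch below shows the same pattern in miniature; the enum values, table contents and handler names are invented for illustration and are not taken from the kernel.

#include <stdio.h>

enum state { ST_CLOSED, ST_ESTABLISHED, ST_NUM_STATES };
enum chunk { CK_DATA, CK_SHUTDOWN, CK_NUM_CHUNKS };

struct sm_entry {
	int (*fn)(void);
	const char *name;
};

/* same trick as TYPE_SCTP_FUNC: record the handler and its name together */
#define FUNC(f) { .fn = f, .name = #f }

static int handle_data(void)     { puts("deliver DATA");   return 0; }
static int handle_shutdown(void) { puts("start shutdown"); return 0; }
static int discard(void)         { puts("discard chunk");  return 0; }

/* primary index: chunk type; secondary index: state */
static const struct sm_entry table[CK_NUM_CHUNKS][ST_NUM_STATES] = {
	[CK_DATA] = {
		[ST_CLOSED]      = FUNC(discard),
		[ST_ESTABLISHED] = FUNC(handle_data),
	},
	[CK_SHUTDOWN] = {
		[ST_CLOSED]      = FUNC(discard),
		[ST_ESTABLISHED] = FUNC(handle_shutdown),
	},
};

/* mirrors the shape of sctp_chunk_event_lookup(): bounds check, then index */
static const struct sm_entry *lookup(enum chunk ck, enum state st)
{
	if (ck >= CK_NUM_CHUNKS || st >= ST_NUM_STATES)
		return NULL;
	return &table[ck][st];
}

int main(void)
{
	const struct sm_entry *e = lookup(CK_DATA, ST_ESTABLISHED);

	if (e) {
		printf("dispatching %s\n", e->name);
		e->fn();
	}
	return 0;
}

Keeping the per-state behaviour in data rather than in switch statements is what lets new chunk types (ASCONF, FWD_TSN, RECONF, AUTH, PAD) be added as extra tables without touching the dispatch path.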
// SPDX-License-Identifier: GPL-2.0-or-later
/* Basic authentication token and access key management
 *
 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/poison.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/workqueue.h> #include <linux/random.h> #include <linux/err.h> #include "internal.h" struct kmem_cache *key_jar; struct rb_root key_serial_tree; /* tree of keys indexed by serial */ DEFINE_SPINLOCK(key_serial_lock); struct rb_root key_user_tree; /* tree of quota records indexed by UID */ DEFINE_SPINLOCK(key_user_lock); unsigned int key_quota_root_maxkeys = 1000000; /* root's key count quota */ unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */ unsigned int key_quota_maxkeys = 200; /* general key count quota */ unsigned int key_quota_maxbytes = 20000; /* general key space quota */ static LIST_HEAD(key_types_list); static DECLARE_RWSEM(key_types_sem); /* We serialise key instantiation and link */ DEFINE_MUTEX(key_construction_mutex); #ifdef KEY_DEBUGGING void __key_check(const struct key *key) { printk("__key_check: key %p {%08x} should be {%08x}\n", key, key->magic, KEY_DEBUG_MAGIC); BUG(); } #endif /* * Get the key quota record for a user, allocating a new record if one doesn't * already exist. */ struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; struct rb_node *parent, **p; try_again: parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); /* search the tree for a user record with a matching UID */ while (*p) { parent = *p; user = rb_entry(parent, struct key_user, node); if (uid_lt(uid, user->uid)) p = &(*p)->rb_left; else if (uid_gt(uid, user->uid)) p = &(*p)->rb_right; else goto found; } /* if we get here, we failed to find a match in the tree */ if (!candidate) { /* allocate a candidate user record if we don't already have * one */ spin_unlock(&key_user_lock); user = NULL; candidate = kmalloc(sizeof(struct key_user), GFP_KERNEL); if (unlikely(!candidate)) goto out; /* the allocation may have scheduled, so we need to repeat the * search lest someone else added the record whilst we were * asleep */ goto try_again; } /* if we get here, then the user record still hadn't appeared on the * second pass - so we use the candidate record */ refcount_set(&candidate->usage, 1); atomic_set(&candidate->nkeys, 0); atomic_set(&candidate->nikeys, 0); candidate->uid = uid; candidate->qnkeys = 0; candidate->qnbytes = 0; spin_lock_init(&candidate->lock); mutex_init(&candidate->cons_lock); rb_link_node(&candidate->node, parent, p); rb_insert_color(&candidate->node, &key_user_tree); spin_unlock(&key_user_lock); user = candidate; goto out; /* okay - we found a user record for this UID */ found: refcount_inc(&user->usage); spin_unlock(&key_user_lock); kfree(candidate); out: return user; } /* * Dispose of a user structure */ void key_user_put(struct key_user *user) { if (refcount_dec_and_lock(&user->usage, &key_user_lock)) { rb_erase(&user->node, &key_user_tree); spin_unlock(&key_user_lock); kfree(user); } } /* * Allocate a serial number for a key. These are assigned randomly to avoid * security issues through covert channel problems. 
*/ static inline void key_alloc_serial(struct key *key) { struct rb_node *parent, **p; struct key *xkey; /* propose a random serial number and look for a hole for it in the * serial number tree */ do { get_random_bytes(&key->serial, sizeof(key->serial)); key->serial >>= 1; /* negative numbers are not permitted */ } while (key->serial < 3); spin_lock(&key_serial_lock); attempt_insertion: parent = NULL; p = &key_serial_tree.rb_node; while (*p) { parent = *p; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) p = &(*p)->rb_left; else if (key->serial > xkey->serial) p = &(*p)->rb_right; else goto serial_exists; } /* we've found a suitable hole - arrange for this key to occupy it */ rb_link_node(&key->serial_node, parent, p); rb_insert_color(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); return; /* we found a key with the proposed serial number - walk the tree from * that point looking for the next unused serial number */ serial_exists: for (;;) { key->serial++; if (key->serial < 3) { key->serial = 3; goto attempt_insertion; } parent = rb_next(parent); if (!parent) goto attempt_insertion; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) goto attempt_insertion; } } /** * key_alloc - Allocate a key of the specified type. * @type: The type of key to allocate. * @desc: The key description to allow the key to be searched out. * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. * @restrict_link: Optional link restriction for new keyrings. * * Allocate a key of the specified type with the attributes given. The key is * returned in an uninstantiated state and the caller needs to instantiate the * key before returning. * * The restrict_link structure (if not NULL) will be freed when the * keyring is destroyed, so it must be dynamically allocated. * * The user's key count quota is updated to reflect the creation of the key and * the user's key data quota has the default for the key type reserved. The * instantiation function should amend this as necessary. If insufficient * quota is available, -EDQUOT will be returned. * * The LSM security modules can prevent a key being created, in which case * -EACCES will be returned. * * Returns a pointer to the new key if successful and an error code otherwise. * * Note that the caller needs to ensure the key type isn't uninstantiated. * Internally this can be done by locking key_types_sem. Externally, this can * be done by either never unregistering the key type, or making sure * key_alloc() calls don't race with module unloading. 
*/ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; struct key *key; size_t desclen, quotalen; int ret; unsigned long irqflags; key = ERR_PTR(-EINVAL); if (!desc || !*desc) goto error; if (type->vet_description) { ret = type->vet_description(desc); if (ret < 0) { key = ERR_PTR(ret); goto error; } } desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) goto no_memory_1; /* check that the user's quota permits allocation of another key and * its description */ if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; spin_lock_irqsave(&user->lock, irqflags); if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { if (user->qnkeys + 1 > maxkeys || user->qnbytes + quotalen > maxbytes || user->qnbytes + quotalen < user->qnbytes) goto no_quota; } user->qnkeys++; user->qnbytes += quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } /* allocate and initialise the key and its description */ key = kmem_cache_zalloc(key_jar, GFP_KERNEL); if (!key) goto no_memory_2; key->index_key.desc_len = desclen; key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL); if (!key->index_key.description) goto no_memory_3; key->index_key.type = type; key_set_index_key(&key->index_key); refcount_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); key->user = user; key->quotalen = quotalen; key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; key->perm = perm; key->expiry = TIME64_MAX; key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; if (flags & KEY_ALLOC_UID_KEYRING) key->flags |= 1 << KEY_FLAG_UID_KEYRING; if (flags & KEY_ALLOC_SET_KEEP) key->flags |= 1 << KEY_FLAG_KEEP; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; #endif /* let the security module know about the key */ ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; /* publish the key by giving it a serial number */ refcount_inc(&key->domain_tag->usage); atomic_inc(&user->nkeys); key_alloc_serial(key); error: return key; security_error: kfree(key->description); kmem_cache_free(key_jar, key); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); key = ERR_PTR(ret); goto error; no_memory_3: kmem_cache_free(key_jar, key); no_memory_2: if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); no_memory_1: key = ERR_PTR(-ENOMEM); goto error; no_quota: spin_unlock_irqrestore(&user->lock, irqflags); key_user_put(user); key = ERR_PTR(-EDQUOT); goto error; } EXPORT_SYMBOL(key_alloc); /** * key_payload_reserve - Adjust data quota reservation for the key's payload * @key: The key to make the reservation for. * @datalen: The amount of data payload the caller now wants. 
* * Adjust the amount of the owning user's key data quota that a key reserves. * If the amount is increased, then -EDQUOT may be returned if there isn't * enough free quota available. * * If successful, 0 is returned. */ int key_payload_reserve(struct key *key, size_t datalen) { int delta = (int)datalen - key->datalen; int ret = 0; key_check(key); /* contemplate the quota adjustment */ if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxbytes = uid_eq(key->user->uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; unsigned long flags; spin_lock_irqsave(&key->user->lock, flags); if (delta > 0 && (key->user->qnbytes + delta > maxbytes || key->user->qnbytes + delta < key->user->qnbytes)) { ret = -EDQUOT; } else { key->user->qnbytes += delta; key->quotalen += delta; } spin_unlock_irqrestore(&key->user->lock, flags); } /* change the recorded data length if that didn't generate an error */ if (ret == 0) key->datalen = datalen; return ret; } EXPORT_SYMBOL(key_payload_reserve); /* * Change the key state to being instantiated. */ static void mark_key_instantiated(struct key *key, int reject_error) { /* Commit the payload before setting the state; barrier versus * key_read_state(). */ smp_store_release(&key->state, (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); } /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's * semaphore need not be locked as instantiation is serialised by * key_construction_mutex. */ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, struct assoc_array_edit **_edit) { int ret, awaken; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; /* and link it into the destination keyring */ if (keyring) { if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); __key_link(keyring, key, _edit); } /* disable the authorisation key */ if (authkey) key_invalidate(authkey); if (prep->expiry != TIME64_MAX) key_set_expiry(key, prep->expiry); } } mutex_unlock(&key_construction_mutex); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret; } /** * key_instantiate_and_link - Instantiate a key and link it into the keyring. * @key: The key to instantiate. * @data: The data to use to instantiate the keyring. * @datalen: The length of @data. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Instantiate a key that's in the uninstantiated state using the provided data * and, if successful, link it in to the destination keyring if one is * supplied. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. 
*/ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey) { struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; int ret; memset(&prep, 0, sizeof(prep)); prep.orig_description = key->description; prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } if (keyring) { ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_link_end; if (keyring->restrict_link && keyring->restrict_link->check) { struct key_restriction *keyres = keyring->restrict_link; ret = keyres->check(keyring, key->type, &prep.payload, keyres->key); if (ret < 0) goto error_link_end; } } ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); error_link_end: if (keyring) __key_link_end(keyring, &key->index_key, edit); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_instantiate_and_link); /** * key_reject_and_link - Negatively instantiate a key and link it into the keyring. * @key: The key to instantiate. * @timeout: The timeout on the negative key. * @error: The error to return when the key is hit. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Negatively instantiate a key that's in the uninstantiated state and, if * successful, set its timeout and stored error and link it in to the * destination keyring if one is supplied. The key and any links to the key * will be automatically garbage collected after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the stored error code (typically ENOKEY) until the negative * key expires. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey) { struct assoc_array_edit *edit = NULL; int ret, awaken, link_ret = 0; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; if (keyring) { if (keyring->restrict_link) return -EPERM; link_ret = __key_link_lock(keyring, &key->index_key); if (link_ret == 0) { link_ret = __key_link_begin(keyring, &key->index_key, &edit); if (link_ret < 0) __key_link_end(keyring, &key->index_key, edit); } } mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key_set_expiry(key, ktime_get_real_seconds() + timeout); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; ret = 0; /* and link it into the destination keyring */ if (keyring && link_ret == 0) __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) key_invalidate(authkey); } mutex_unlock(&key_construction_mutex); if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret == 0 ? 
link_ret : ret; } EXPORT_SYMBOL(key_reject_and_link); /** * key_put - Discard a reference to a key. * @key: The key to discard a reference from. * * Discard a reference to a key, and when all the references are gone, we * schedule the cleanup task to come and pull it out of the tree in process * context at some later time. */ void key_put(struct key *key) { if (key) { key_check(key); if (refcount_dec_and_test(&key->usage)) { unsigned long flags; /* deal with the user's key tracking and quota */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { spin_lock_irqsave(&key->user->lock, flags); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } schedule_work(&key_gc_work); } } } EXPORT_SYMBOL(key_put); /* * Find a key by its serial number. */ struct key *key_lookup(key_serial_t id) { struct rb_node *n; struct key *key; spin_lock(&key_serial_lock); /* search the tree for the specified key */ n = key_serial_tree.rb_node; while (n) { key = rb_entry(n, struct key, serial_node); if (id < key->serial) n = n->rb_left; else if (id > key->serial) n = n->rb_right; else goto found; } not_found: key = ERR_PTR(-ENOKEY); goto error; found: /* A key is allowed to be looked up only if someone still owns a * reference to it - otherwise it's awaiting the gc. */ if (!refcount_inc_not_zero(&key->usage)) goto not_found; error: spin_unlock(&key_serial_lock); return key; } EXPORT_SYMBOL(key_lookup); /* * Find and lock the specified key type against removal. * * We return with the sem read-locked if successful. If the type wasn't * available -ENOKEY is returned instead. */ struct key_type *key_type_lookup(const char *type) { struct key_type *ktype; down_read(&key_types_sem); /* look up the key type to see if it's one of the registered kernel * types */ list_for_each_entry(ktype, &key_types_list, link) { if (strcmp(ktype->name, type) == 0) goto found_kernel_type; } up_read(&key_types_sem); ktype = ERR_PTR(-ENOKEY); found_kernel_type: return ktype; } void key_set_timeout(struct key *key, unsigned timeout) { time64_t expiry = TIME64_MAX; /* make the changes with the locks held to prevent races */ down_write(&key->sem); if (timeout > 0) expiry = ktime_get_real_seconds() + timeout; key_set_expiry(key, expiry); up_write(&key->sem); } EXPORT_SYMBOL_GPL(key_set_timeout); /* * Unlock a key type locked by key_type_lookup(). */ void key_type_put(struct key_type *ktype) { up_read(&key_types_sem); } /* * Attempt to update an existing key. * * The key is given to us with an incremented refcount that we need to discard * if we get an error. */ static inline key_ref_t __key_update(key_ref_t key_ref, struct key_preparsed_payload *prep) { struct key *key = key_ref_to_ptr(key_ref); int ret; /* need write permission on the key to update it */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) goto error; ret = -EEXIST; if (!key->type->update) goto error; down_write(&key->sem); ret = key->type->update(key, prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); if (ret < 0) goto error; out: return key_ref; error: key_put(key); key_ref = ERR_PTR(ret); goto out; } /* * Create or potentially update a key. 
The combined logic behind * key_create_or_update() and key_create() */ static key_ref_t __key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags, bool allow_update) { struct keyring_index_key index_key = { .description = description, }; struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; const struct cred *cred = current_cred(); struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; struct key_restriction *restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ index_key.type = key_type_lookup(type); if (IS_ERR(index_key.type)) { key_ref = ERR_PTR(-ENODEV); goto error; } key_ref = ERR_PTR(-EINVAL); if (!index_key.type->instantiate || (!index_key.description && !index_key.type->preparse)) goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION)) restrict_link = keyring->restrict_link; key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) goto error_put_type; memset(&prep, 0, sizeof(prep)); prep.orig_description = description; prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; prep.expiry = TIME64_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } if (!index_key.description) index_key.description = prep.description; key_ref = ERR_PTR(-EINVAL); if (!index_key.description) goto error_free_prep; } index_key.desc_len = strlen(index_key.description); key_set_index_key(&index_key); ret = __key_link_lock(keyring, &index_key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } if (restrict_link && restrict_link->check) { ret = restrict_link->check(keyring, index_key.type, &prep.payload, restrict_link->key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } } /* if we're going to allocate a new key, we're going to have * to modify the keyring */ ret = key_permission(keyring_ref, KEY_NEED_WRITE); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } /* if it's requested and possible to update this type of key, search * for an existing key of the same type and description in the * destination keyring and update that instead if possible */ if (allow_update) { if (index_key.type->update) { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) goto found_matching_key; } } else { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) { key_ref_put(key_ref); key_ref = ERR_PTR(-EEXIST); goto error_link_end; } } /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (index_key.type->read) perm |= KEY_POS_READ; if (index_key.type == &key_type_keyring || index_key.type->update) perm |= KEY_POS_WRITE; } /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; } /* instantiate it and link it into the target keyring */ ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); goto error_link_end; } 
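/* New key created, instantiated and linked: give the LSM a look at the
 * finished key, then return a key_ref carrying the possession bit taken
 * from the caller's keyring reference. */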
security_key_post_create_or_update(keyring, key, payload, plen, flags, true); key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: __key_link_end(keyring, &index_key, edit); error_free_prep: if (index_key.type->preparse) index_key.type->free_preparse(&prep); error_put_type: key_type_put(index_key.type); error: return key_ref; found_matching_key: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ __key_link_end(keyring, &index_key, edit); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { ret = wait_for_key_construction(key, true); if (ret < 0) { key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error_free_prep; } } key_ref = __key_update(key_ref, &prep); if (!IS_ERR(key_ref)) security_key_post_create_or_update(keyring, key, payload, plen, flags, false); goto error_free_prep; } /** * key_create_or_update - Update or create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Search the destination keyring for a key of the same description and if one * is found, update it, otherwise create and instantiate a new one and create a * link to it from that keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -ENODEV if the key type * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the * caller isn't permitted to modify the keyring or the LSM did not permit * creation of the key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. */ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, true); } EXPORT_SYMBOL(key_create_or_update); /** * key_create - Create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Create and instantiate a new key and link to it from the destination keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -EEXIST if a key with the * same description already exists, -ENODEV if the key type wasn't available, * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't * permitted to modify the keyring or the LSM did not permit creation of the * key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. 
*/ key_ref_t key_create(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, false); } EXPORT_SYMBOL(key_create); /** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. * @plen: The length of @payload. * * Attempt to update the contents of a key with the given payload data. The * caller must be granted Write permission on the key. Negative keys can be * instantiated by this method. * * Returns 0 on success, -EACCES if not permitted and -EOPNOTSUPP if the key * type does not support updating. The key type may return other errors. */ int key_update(key_ref_t key_ref, const void *payload, size_t plen) { struct key_preparsed_payload prep; struct key *key = key_ref_to_ptr(key_ref); int ret; key_check(key); /* the key must be writable */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) return ret; /* attempt to update it if supported */ if (!key->type->update) return -EOPNOTSUPP; memset(&prep, 0, sizeof(prep)); prep.data = payload; prep.datalen = plen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } down_write(&key->sem); ret = key->type->update(key, &prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_update); /** * key_revoke - Revoke a key. * @key: The key to be revoked. * * Mark a key as being revoked and ask the type to free up its resources. The * revocation timeout is set and the key and all its links will be * automatically garbage collected after key_gc_delay amount of time if they * are not manually dealt with first. */ void key_revoke(struct key *key) { time64_t time; key_check(key); /* make sure no one's trying to change or use the key when we mark it * - we tell lockdep that we might nest because we might be revoking an * authorisation key whilst holding the sem on a key we've just * instantiated */ down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { notify_key(key, NOTIFY_KEY_REVOKED, 0); if (key->type->revoke) key->type->revoke(key); /* set the death time to no more than the expiry time */ time = ktime_get_real_seconds(); if (key->revoked_at == 0 || key->revoked_at > time) { key->revoked_at = time; key_schedule_gc(key->revoked_at + key_gc_delay); } } up_write(&key->sem); } EXPORT_SYMBOL(key_revoke); /** * key_invalidate - Invalidate a key. * @key: The key to be invalidated. * * Mark a key as being invalidated and have it cleaned up immediately. The key * is ignored by all searches and other operations from this point. */ void key_invalidate(struct key *key) { kenter("%d", key_serial(key)); key_check(key); if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); } up_write(&key->sem); } } EXPORT_SYMBOL(key_invalidate); /** * generic_key_instantiate - Simple instantiation of a key from preparsed data * @key: The key to be instantiated * @prep: The preparsed data to load. 
* * Instantiate a key from preparsed data. We assume we can just copy the data * in directly and clear the old pointers. * * This can be pointed to directly by the key type instantiate op pointer. */ int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { int ret; pr_devel("==>%s()\n", __func__); ret = key_payload_reserve(key, prep->quotalen); if (ret == 0) { rcu_assign_keypointer(key, prep->payload.data[0]); key->payload.data[1] = prep->payload.data[1]; key->payload.data[2] = prep->payload.data[2]; key->payload.data[3] = prep->payload.data[3]; prep->payload.data[0] = NULL; prep->payload.data[1] = NULL; prep->payload.data[2] = NULL; prep->payload.data[3] = NULL; } pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL(generic_key_instantiate); /** * register_key_type - Register a type of key. * @ktype: The new key type. * * Register a new key type. * * Returns 0 on success or -EEXIST if a type of this name already exists. */ int register_key_type(struct key_type *ktype) { struct key_type *p; int ret; memset(&ktype->lock_class, 0, sizeof(ktype->lock_class)); ret = -EEXIST; down_write(&key_types_sem); /* disallow key types with the same name */ list_for_each_entry(p, &key_types_list, link) { if (strcmp(p->name, ktype->name) == 0) goto out; } /* store the type */ list_add(&ktype->link, &key_types_list); pr_notice("Key type %s registered\n", ktype->name); ret = 0; out: up_write(&key_types_sem); return ret; } EXPORT_SYMBOL(register_key_type); /** * unregister_key_type - Unregister a type of key. * @ktype: The key type. * * Unregister a key type and mark all the extant keys of this type as dead. * Those keys of this type are then destroyed to get rid of their payloads and * they and their links will be garbage collected as soon as possible. */ void unregister_key_type(struct key_type *ktype) { down_write(&key_types_sem); list_del_init(&ktype->link); downgrade_write(&key_types_sem); key_gc_keytype(ktype); pr_notice("Key type %s unregistered\n", ktype->name); up_read(&key_types_sem); } EXPORT_SYMBOL(unregister_key_type); /* * Initialise the key management state. */ void __init key_init(void) { /* allocate a slab in which we can store keys */ key_jar = kmem_cache_create("key_jar", sizeof(struct key), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* add the special key types */ list_add_tail(&key_type_keyring.link, &key_types_list); list_add_tail(&key_type_dead.link, &key_types_list); list_add_tail(&key_type_user.link, &key_types_list); list_add_tail(&key_type_logon.link, &key_types_list); /* record the root user tracking */ rb_link_node(&root_key_user.node, NULL, &key_user_tree.rb_node); rb_insert_color(&root_key_user.node, &key_user_tree); }
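A brief usage sketch may help tie the key APIs above together: it stores a blob under a "user" key via key_create_or_update(), updating the key in place if one with the same description already exists. The helper name and the caller-supplied keyring are hypothetical and not part of the file.

/* Illustrative sketch only: the helper and its keyring argument are
 * hypothetical, not part of this file. */
static int example_store_blob(struct key *keyring, const char *desc,
			      const void *blob, size_t len)
{
	key_ref_t kref;

	/* Update an existing "user" key of this description, or create a
	 * new one and link it into the keyring. */
	kref = key_create_or_update(make_key_ref(keyring, true), "user",
				    desc, blob, len, KEY_PERM_UNDEF,
				    KEY_ALLOC_IN_QUOTA);
	if (IS_ERR(kref))
		return PTR_ERR(kref);

	key_ref_put(kref);
	return 0;
}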
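The register_key_type()/unregister_key_type() pair above is typically driven from module init/exit. Below is a minimal, hypothetical sketch of a key type that reuses generic_key_instantiate(); a real type would normally also supply .preparse, .free_preparse, .destroy and .read.

/* Illustrative sketch only: "example" is a hypothetical key type/module. */
#include <linux/module.h>
#include <linux/key-type.h>
#include <linux/seq_file.h>

static void example_describe(const struct key *key, struct seq_file *m)
{
	seq_puts(m, key->description);
}

static struct key_type key_type_example = {
	.name		= "example",
	.instantiate	= generic_key_instantiate,
	.describe	= example_describe,
};

static int __init example_keys_init(void)
{
	/* Returns -EEXIST if a type of the same name is already registered. */
	return register_key_type(&key_type_example);
}

static void __exit example_keys_exit(void)
{
	unregister_key_type(&key_type_example);
}

module_init(example_keys_init);
module_exit(example_keys_exit);
MODULE_LICENSE("GPL");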
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 }, }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get ||
devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u32 max_io_eqs; int err; if (!port->ops->port_fn_max_io_eqs_get) return 0; err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; 
break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, 
extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { u32 max_io_eqs; max_io_eqs = nla_get_u32(attr); return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port, max_io_eqs, extack); } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = 
devlink_port->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); devlink_nl_obj_desc_port_set(&desc, devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int 
devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] && !ops->port_fn_max_io_eqs_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS], "Function does not support max_io_eqs setting"); return -EOPNOTSUPP; } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]; if (attr) { err = devlink_port_fn_max_io_eqs_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. 
*/ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. 
*/ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. 
*/ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and release devlink->lock <mutex>. 
*/ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. 
*/ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. 
*/ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 
port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devlink_port; /* RTNL mutex is held here which ensures that devlink_port * instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. 
*/ ASSERT_RTNL(); devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; return __devlink_port_phys_port_name_get(devlink_port, name, len); } int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid) { struct devlink_port *devlink_port; /* Caller must hold RTNL mutex or reference to dev, which ensures that * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); return 0; }
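To illustrate the registration helpers above from a driver's perspective, the sketch below sets physical-port attributes and registers the port; the ops table, helper name and port number are hypothetical. devlink_port_register_with_ops() takes devlink->lock itself, and the caller is expected to have zeroed the devlink_port structure beforehand.

/* Illustrative sketch only: ops table, helper and port number are
 * hypothetical. The devlink_port structure is assumed to be zeroed. */
static const struct devlink_port_ops example_port_ops = {
	/* optional callbacks such as .port_split / .port_unsplit go here */
};

static int example_register_phys_port(struct devlink *devlink,
				      struct devlink_port *devlink_port,
				      unsigned int port_index)
{
	struct devlink_port_attrs attrs = {};

	/* Attributes must be set before the port is registered. */
	attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
	attrs.phys.port_number = port_index;
	devlink_port_attrs_set(devlink_port, &attrs);

	/* Takes and releases devlink->lock internally. */
	return devlink_port_register_with_ops(devlink, devlink_port,
					      port_index, &example_port_ops);
}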
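Similarly, a hypothetical sketch of setting PCI VF attributes with devlink_port_attrs_pci_vf_set() before registration; with these values, __devlink_port_phys_port_name_get() above would derive the compat phys_port_name "pf0vf3" (a "c<controller>" prefix is only added for external controllers).

/* Illustrative sketch only: the helper and the chosen numbers are
 * hypothetical. Must be called before the port is registered. */
static void example_set_vf_attrs(struct devlink_port *devlink_port)
{
	devlink_port_attrs_pci_vf_set(devlink_port,
				      0,	/* controller number */
				      0,	/* PF number */
				      3,	/* VF number */
				      false);	/* not an external controller */
}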
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RDS_RDS_H #define _RDS_RDS_H #include <net/sock.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <rdma/rdma_cm.h> #include <linux/mutex.h> #include <linux/rds.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #include <linux/in6.h> #include "info.h" /* * RDS Network protocol version */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 #define RDS_PROTOCOL_4_0 0x0400 #define RDS_PROTOCOL_4_1 0x0401 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) #define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1 /* The following ports, 16385, 18634, 18635, are registered with IANA as * the ports to be used for RDS over TCP and UDP. Currently, only RDS over * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept * to ensure compatibility with older RDS modules. Those ports are defined * in each transport's header file. */ #define RDS_PORT 18634 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ static inline __printf(1, 2) void rdsdebug(char *fmt, ...) { } #endif #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) /* Used to limit both RDMA and non-RDMA RDS message to 1MB */ #define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) struct rds_cong_map { struct rb_node m_rb_node; struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; }; /* * This is how we will track the connection state: * A connection is always in one of the following * states. Updates to the state are atomic and imply * a memory barrier. */ enum { RDS_CONN_DOWN = 0, RDS_CONN_CONNECTING, RDS_CONN_DISCONNECTING, RDS_CONN_UP, RDS_CONN_RESETTING, RDS_CONN_ERROR, }; /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 #define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. 
Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 #define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) /* Per mpath connection state */ struct rds_conn_path { struct rds_connection *cp_conn; struct rds_message *cp_xmit_rm; unsigned long cp_xmit_sg; unsigned int cp_xmit_hdr_off; unsigned int cp_xmit_data_off; unsigned int cp_xmit_atomic_sent; unsigned int cp_xmit_rdma_sent; unsigned int cp_xmit_data_sent; spinlock_t cp_lock; /* protect msg queues */ u64 cp_next_tx_seq; struct list_head cp_send_queue; struct list_head cp_retrans; u64 cp_next_rx_seq; void *cp_transport_data; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; unsigned long cp_reconnect_jiffies; struct delayed_work cp_send_w; struct delayed_work cp_recv_w; struct delayed_work cp_conn_w; struct work_struct cp_down_w; struct mutex cp_cm_lock; /* protect cp_state & cm */ wait_queue_head_t cp_waitq; unsigned int cp_unacked_packets; unsigned int cp_unacked_bytes; unsigned int cp_index; }; /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; int c_dev_if; /* ifindex used for this conn */ int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; possible_net_t c_net; /* TOS */ u8 c_tos; struct list_head c_map_item; unsigned long c_map_queued; struct rds_conn_path *c_path; wait_queue_head_t c_hs_waitq; /* handshake waitq */ u32 c_my_gen_num; u32 c_peer_gen_num; }; static inline struct net *rds_conn_net(struct rds_connection *conn) { return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. * Currently the control information that is exchanged is the number of * supported paths. If the peer is a legacy (older kernel revision) peer, * it would return a pong message without additional control information * that would then alert the sender that the peer was an older rev. */ #define RDS_FLAG_PROBE_PORT 1 #define RDS_HS_PROBE(sport, dport) \ ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ #define RDS_HEADER_EXT_SPACE 16 struct rds_header { __be64 h_sequence; __be64 h_ack; __be32 h_len; __be16 h_sport; __be16 h_dport; u8 h_flags; u8 h_credit; u8 h_padding[4]; __sum16 h_csum; u8 h_exthdr[RDS_HEADER_EXT_SPACE]; }; /* * Reserved - indicates end of extensions */ #define RDS_EXTHDR_NONE 0 /* * This extension header is included in the very * first message that is sent on a new connection, * and identifies the protocol level. This will help * rolling updates if a future change requires breaking * the protocol. 
* NB: This is no longer true for IB, where we do a version * negotiation during the connection setup phase (protocol * version information is included in the RDMA CM private data). */ #define RDS_EXTHDR_VERSION 1 struct rds_ext_header_version { __be32 h_version; }; /* * This extension header is included in the RDS message * chasing an RDMA operation. */ #define RDS_EXTHDR_RDMA 2 struct rds_ext_header_rdma { __be32 h_rdma_rkey; }; /* * This extension header tells the peer about the * destination <R_Key,offset> of the requested RDMA * operation. */ #define RDS_EXTHDR_RDMA_DEST 3 struct rds_ext_header_rdma_dest { __be32 h_rdma_rkey; __be32 h_rdma_offset; }; /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 #define RDS_MSG_RX_END 2 #define RDS_MSG_RX_CMSG 3 /* The following values are whitelisted for usercopy */ struct rds_inc_usercopy { rds_rdma_cookie_t rdma_cookie; ktime_t rx_tstamp; }; struct rds_incoming { refcount_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; struct in6_addr i_saddr; struct rds_inc_usercopy i_usercopy; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { struct rb_node r_rb_node; struct kref r_kref; u32 r_key; /* A copy of the creation flags */ unsigned int r_use_once:1; unsigned int r_invalidate:1; unsigned int r_write:1; struct rds_sock *r_sock; /* back pointer to the socket that owns us */ struct rds_transport *r_trans; void *r_trans_private; }; static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); } static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) { return cookie; } static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) { return cookie >> 32; } /* atomic operation types */ #define RDS_ATOMIC_TYPE_CSWP 0 #define RDS_ATOMIC_TYPE_FADD 1 /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty * the message will not be put back on the retransmit list after being sent. * messages that are canceled while being sent rely on this. * * m_inc is used by loopback so that it can pass an incoming message straight * back up into the rx path. It embeds a wire header which is also used by * the send path, which is kind of awkward. * * m_sock_item indicates the message's presence on a socket's send or receive * queue. m_rs will point to that socket. * * m_daddr is used by cancellation to prune messages to a given destination. * * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock * nesting. As paths iterate over messages on a sock, or conn, they must * also lock the conn, or sock, to remove the message from those lists too. * Testing the flag to determine if the message is still on the lists lets * us avoid testing the list_head directly. That means each path can use * the message's list_head to keep it on a local list while juggling locks * without confusing the other path. * * m_ack_seq is an optional field set by transports who need a different * sequence number range to invalidate. They can use this in a callback * that they pass to rds_send_drop_acked() to see if each message has been * acked. 
The HAS_ACK_SEQ flag can be used to detect messages which haven't * had ack_seq set yet. */ #define RDS_MSG_ON_SOCK 1 #define RDS_MSG_ON_CONN 2 #define RDS_MSG_HAS_ACK_SEQ 3 #define RDS_MSG_ACK_REQUIRED 4 #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 struct rds_znotifier { struct mmpin z_mmp; u32 z_cookie; }; struct rds_msg_zcopy_info { struct list_head rs_zcookie_next; union { struct rds_znotifier znotif; struct rds_zcopy_cookies zcookies; }; }; struct rds_msg_zcopy_queue { struct list_head zcookie_head; spinlock_t lock; /* protects zcookie_head queue */ }; static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) { spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->zcookie_head); } struct rds_iov_vector { struct rds_iovec *iov; int len; }; struct rds_iov_vector_arr { struct rds_iov_vector *vec; int len; int indx; int incr; }; struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. * Lock nesting is * rm->m_rs_lock * -> rs->rs_lock */ spinlock_t m_rs_lock; wait_queue_head_t m_flush_wait; struct rds_sock *m_rs; /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; unsigned int m_used_sgs; unsigned int m_total_sgs; void *m_final_op; struct { struct rm_atomic_op { int op_type; union { struct { uint64_t compare; uint64_t swap; uint64_t compare_mask; uint64_t swap_mask; } op_m_cswp; struct { uint64_t add; uint64_t nocarry_mask; } op_m_fadd; }; u32 op_rkey; u64 op_remote_addr; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; } atomic; struct rm_rdma_op { u32 op_rkey; u64 op_remote_addr; unsigned int op_write:1; unsigned int op_fence:1; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; unsigned int op_bytes; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; u64 op_odp_addr; struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; struct rds_conn_path *m_conn_path; }; /* * The RDS notifier is used (optionally) to tell the application about * completed RDMA operations. Rather than keeping the whole rds message * around on the queue, we allocate a small notifier that is put on the * socket's notifier_list. Notifications are delivered to the application * through control messages. */ struct rds_notifier { struct list_head n_list; uint64_t n_user_token; int n_status; }; /* Available as part of RDS core, so doesn't need to participate * in get_preferred transport etc */ #define RDS_TRANS_LOOP 3 /** * struct rds_transport - transport specific behavioural hooks * * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send * part of a message. The caller serializes on the send_sem so this * doesn't need to be reentrant for a given conn. The header must be * sent before the data payload. .xmit must be prepared to send a * message with no data payload. 
.xmit should return the number of * bytes that were sent down the connection, including header bytes. * Returning 0 tells the caller that it doesn't need to perform any * additional work now. This is usually the case when the transport has * filled the sending queue for its connection and will handle * triggering the rds thread to continue the send when space becomes * available. Returning -EAGAIN tells the caller to retry the send * immediately. Returning -ENOMEM tells the caller to retry the send at * some point in the future. * * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once * it returns the connection can not call rds_recv_incoming(). * This will only be called once after conn_connect returns * non-zero success. The caller serializes this with * the send and connecting paths (xmit_* and conn_*). The * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. */ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; unsigned int t_prefer_loopback:1, t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, const struct in6_addr *addr, __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6); int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, unsigned int avail); void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); bool (*t_unloading)(struct rds_connection *conn); u8 (*get_tos_map)(u8 tos); }; /* Bind hash table key length. It is the sum of the size of a struct * in6_addr, a scope_id and a port. */ #define RDS_BOUND_KEY_LEN \ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) struct rds_sock { struct sock rs_sk; u64 rs_user_addr; u64 rs_user_bytes; /* * bound_addr used for both incoming and outgoing, no INADDR_ANY * support.
*/ struct rhash_head rs_bound_node; u8 rs_bound_key[RDS_BOUND_KEY_LEN]; struct sockaddr_in6 rs_bound_sin6; #define rs_bound_addr rs_bound_sin6.sin6_addr #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] #define rs_bound_port rs_bound_sin6.sin6_port #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id struct in6_addr rs_conn_addr; #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; /* * rds_sendmsg caches the conn it used the last time around. * This helps avoid costly lookups. */ struct rds_connection *rs_conn; /* flag indicating we were congested or not */ int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; struct list_head rs_send_queue; u32 rs_snd_bytes; int rs_rcv_bytes; struct list_head rs_notify_queue; /* currently used for failed RDMAs */ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask * to decide whether the application should be woken up. * If not set, we use rs_cong_track to find out whether a cong map * update arrived. */ uint64_t rs_cong_mask; uint64_t rs_cong_notify; struct list_head rs_cong_list; unsigned long rs_cong_track; /* * rs_recv_lock protects the receive queue, and is * used to serialize with rds_release. */ rwlock_t rs_recv_lock; struct list_head rs_recv_queue; /* just for stats reporting */ struct list_head rs_item; /* these have their own lock */ spinlock_t rs_rdma_lock; struct rb_root rs_rdma_keys; /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) { return container_of(sk, struct rds_sock, rs_sk); } static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) { return &rs->rs_sk; } /* * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value * to account for overhead. We don't account for overhead, we just apply * the number of payload bytes to the specified value. 
*/ static inline int rds_sk_sndbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_sndbuf / 2; } static inline int rds_sk_rcvbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_rcvbuf / 2; } struct rds_statistics { uint64_t s_conn_reset; uint64_t s_recv_drop_bad_checksum; uint64_t s_recv_drop_old_seq; uint64_t s_recv_drop_no_sock; uint64_t s_recv_drop_dead_sock; uint64_t s_recv_deliver_raced; uint64_t s_recv_delivered; uint64_t s_recv_queued; uint64_t s_recv_immediate_retry; uint64_t s_recv_delayed_retry; uint64_t s_recv_ack_required; uint64_t s_recv_rdma_bytes; uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; uint64_t s_send_lock_contention; uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; uint64_t s_send_ack_required; uint64_t s_send_queued; uint64_t s_send_rdma; uint64_t s_send_rdma_bytes; uint64_t s_send_pong; uint64_t s_page_remainder_hit; uint64_t s_page_remainder_miss; uint64_t s_copy_to_user; uint64_t s_copy_from_user; uint64_t s_cong_update_queued; uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; uint64_t s_recv_bytes_added_to_socket; uint64_t s_recv_bytes_removed_from_socket; uint64_t s_send_stuck_rm; }; /* af_rds.c */ void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); } extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); void rds_cong_add_conn(struct rds_connection *conn); void rds_cong_remove_conn(struct rds_connection *conn); void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); void rds_cong_queue_updates(struct rds_cong_map *map); void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); int rds_cong_updated_since(unsigned long *recent); void rds_cong_add_socket(struct rds_sock *); void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_check_all_paths(struct rds_connection *conn); void 
rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len); __printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) \ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) static inline int rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) { return atomic_cmpxchg(&cp->cp_state, old, new) == old; } static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_transition(&conn->c_path[0], old, new); } static inline int rds_conn_path_state(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_state(&conn->c_path[0]); } static inline int rds_conn_path_up(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_path_down(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_DOWN; } static inline int rds_conn_up(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_up(&conn->c_path[0]); } static inline int rds_conn_path_connecting(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { hdr->h_csum = 0; hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); } static inline int rds_message_verify_checksum(const struct rds_header *hdr) { return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; } /* page.c */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp); void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, 
gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); void rds_send_ping(struct rds_connection *conn, int cp_index); int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void __rds_put_mr_final(struct kref *kref); static inline bool rds_destroy_pending(struct rds_connection *conn) { return !check_net(rds_conn_net(conn)) || (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } enum { ODP_NOT_NEEDED, ODP_ZEROBASED, ODP_VIRTUAL }; /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ per_cpu(which, get_cpu()).member++; \ put_cpu(); \ } while (0) #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) #define rds_stats_add_which(which, member, count) do { \ per_cpu(which, get_cpu()).member += count; \ put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; extern unsigned long rds_sysctl_sndbuf_max; extern unsigned long rds_sysctl_reconnect_min_jiffies; extern unsigned long rds_sysctl_reconnect_max_jiffies; extern unsigned int rds_sysctl_max_unacked_packets; extern unsigned int rds_sysctl_max_unacked_bytes; extern unsigned int rds_sysctl_ping_enable; extern unsigned 
long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); struct rds_transport *rds_trans_get(int t_type); #endif
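One detail worth calling out from the header above: rds_rdma_make_cookie() packs the 32-bit R_Key into the low half of the 64-bit cookie and the destination offset into the high half, and the two accessors simply undo that. A stand-alone sketch of the same packing (plain user-space C, with a local rds_rdma_cookie_t typedef used only for illustration):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t rds_rdma_cookie_t;	/* mirrors the kernel typedef for this sketch */

static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
	return r_key | ((uint64_t)offset << 32);	/* key in low 32 bits, offset in high 32 bits */
}

int main(void)
{
	rds_rdma_cookie_t cookie = make_cookie(0x1234, 4096);

	printf("r_key=0x%x offset=%u\n",
	       (uint32_t)cookie,			/* what rds_rdma_cookie_key() returns    */
	       (uint32_t)(cookie >> 32));	/* what rds_rdma_cookie_offset() returns */
	return 0;
}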
/* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _TCP_AO_H #define _TCP_AO_H #define TCP_AO_KEY_ALIGN 1 #define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) union tcp_ao_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; struct tcp_ao_hdr { u8 kind; u8 length; u8 keyid; u8 rnext_keyid; }; struct tcp_ao_counters { atomic64_t pkt_good; atomic64_t pkt_bad; atomic64_t key_not_found; atomic64_t ao_required; atomic64_t dropped_icmp; }; struct tcp_ao_key { struct hlist_node node; union tcp_ao_addr addr; u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; unsigned int tcp_sigpool_id; unsigned int digest_size; int l3index; u8 prefixlen; u8 family; u8 keylen; u8 keyflags; u8 sndid; u8 rcvid; u8 maclen; struct rcu_head rcu; atomic64_t pkt_good; atomic64_t pkt_bad; u8 traffic_keys[]; }; static inline u8 *rcv_other_key(struct tcp_ao_key *key) { return key->traffic_keys; } static inline u8 *snd_other_key(struct tcp_ao_key *key) { return key->traffic_keys + key->digest_size; } static inline int tcp_ao_maclen(const struct tcp_ao_key *key) { return key->maclen; } /* Use tcp_ao_len_aligned() for TCP header calculations */ static inline int tcp_ao_len(const struct tcp_ao_key *key) { return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); } static inline int tcp_ao_len_aligned(const struct tcp_ao_key *key) { return round_up(tcp_ao_len(key), 4); } static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) { return key->digest_size; } static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) { return sizeof(struct tcp_ao_key) + (key->digest_size << 1); } struct tcp_ao_info { /* List of tcp_ao_key's */ struct hlist_head head; /* current_key and rnext_key are maintained on sockets * in TCP_AO_ESTABLISHED states. * Their purpose is to cache keys on established connections, * saving needless lookups. Never dereference any of them from * listen sockets. * ::current_key may change in RX to the key that was requested by * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid * load/store tearing.
* Do the same for ::rnext_key, if you don't hold socket lock * (it's changed only by userspace request in setsockopt()). */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; struct tcp_ao_counters counters; u32 ao_required :1, accept_icmps :1, __unused :30; __be32 lisn; __be32 risn; /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, * that protect TCP-AO connection from replayed old TCP segments. * See RFC5925 (6.2). * In order to get correct SNE, there's a helper tcp_ao_compute_sne(). * It needs SEQ basis to understand whereabouts are lower SEQ numbers. * According to that basis vector, it can provide incremented SNE * when SEQ rolls over or provide decremented SNE when there's * a retransmitted segment from before-rolling over. * - for request sockets such basis is rcv_isn/snt_isn, which seems * good enough as it's unexpected to receive 4 Gbytes on reqsk. * - for full sockets the basis is rcv_nxt/snd_una. snd_una is * taken instead of snd_nxt as currently it's easier to track * in tcp_snd_una_update(), rather than updating SNE in all * WRITE_ONCE(tp->snd_nxt, ...) * - for time-wait sockets the basis is tw_rcv_nxt/tw_snd_nxt. * tw_snd_nxt is not expected to change, while tw_rcv_nxt may. */ u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_md5_needed; #define static_branch_tcp_md5() static_branch_unlikely(&tcp_md5_needed.key) #else #define static_branch_tcp_md5() false #endif #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ #include <linux/jump_label.h> extern struct static_key_false_deferred tcp_ao_needed; #define static_branch_tcp_ao() static_branch_unlikely(&tcp_ao_needed.key) #else #define static_branch_tcp_ao() false #endif static inline bool tcp_hash_should_produce_warnings(void) { return static_branch_tcp_md5() || static_branch_tcp_ao(); } #define tcp_hash_fail(msg, family, skb, fmt, ...) 
\ do { \ const struct tcphdr *th = tcp_hdr(skb); \ char hdr_flags[6]; \ char *f = hdr_flags; \ \ if (!tcp_hash_should_produce_warnings()) \ break; \ if (th->fin) \ *f++ = 'F'; \ if (th->syn) \ *f++ = 'S'; \ if (th->rst) \ *f++ = 'R'; \ if (th->psh) \ *f++ = 'P'; \ if (th->ack) \ *f++ = '.'; \ *f = 0; \ if ((family) == AF_INET) { \ net_info_ratelimited("%s for %pI4.%d->%pI4.%d [%s] " fmt "\n", \ msg, &ip_hdr(skb)->saddr, ntohs(th->source), \ &ip_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } else { \ net_info_ratelimited("%s for [%pI6c].%d->[%pI6c].%d [%s]" fmt "\n", \ msg, &ipv6_hdr(skb)->saddr, ntohs(th->source), \ &ipv6_hdr(skb)->daddr, ntohs(th->dest), \ hdr_flags, ##__VA_ARGS__); \ } \ } while (0) #ifdef CONFIG_TCP_AO /* TCP-AO structures and functions */ struct tcp4_ao_context { __be32 saddr; __be32 daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp6_ao_context { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; __be32 sisn; __be32 disn; }; struct tcp_sigpool; /* Established states are fast-path and there always is current_key/rnext_key */ #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \ TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING) int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location); int tcp_ao_hash_skb(unsigned short int family, char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, int family); int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp); void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh); u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq); struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid); int tcp_ao_hash_hdr(unsigned short family, char *ao_hash, struct tcp_ao_key *key, const u8 *tkey, const union tcp_ao_addr *daddr, const union tcp_ao_addr *saddr, const struct tcphdr *th, u32 sne); int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, const struct tcp_ao_hdr *aoh, int l3index, u32 seq, struct tcp_ao_key **key, char **traffic_key, bool *allocated_traffic_key, u8 *keyid, u32 *sne); /* ipv4 specific functions */ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int 
tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); /* ipv6 specific functions */ int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, int nbytes); int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key, const struct sk_buff *skb, __be32 sisn, __be32 disn); int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key, struct request_sock *req); struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, struct tcp_ao_key *key, struct tcphdr *th, __u8 *hash_location) { return 0; } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, struct request_sock *req, unsigned short int family) { } static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) { return false; } static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, const struct tcp_ao_hdr *aoh) { return SKB_NOT_DROPPED_YET; } static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index, const union tcp_ao_addr *addr, int family, int sndid, int rcvid) { return NULL; } static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk) { } static inline void tcp_ao_established(struct sock *sk) { } static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) { } static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) { } static inline void tcp_ao_connect_init(struct sock *sk) { } static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int tcp_ao_set_repair(struct sock *sk, 
sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash); #else static inline int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { *md5_hash = NULL; *ao_hash = NULL; return 0; } #endif #endif /* _TCP_AO_H */
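As the helpers above show, the TCP-AO option occupies tcp_ao_maclen() bytes of MAC plus the 4-byte tcp_ao_hdr, and tcp_ao_len_aligned() rounds that up to a multiple of 4 before the option is placed in TCP option space. A stand-alone sketch of the arithmetic (hypothetical MAC lengths; ROUND_UP4 is a user-space stand-in for the kernel's round_up(x, 4)):

#include <stdio.h>

#define TCP_AO_HDR_LEN	4			/* sizeof(struct tcp_ao_hdr): kind, length, keyid, rnext_keyid */
#define ROUND_UP4(x)	(((x) + 3) & ~3)	/* stand-in for round_up(x, 4) */

int main(void)
{
	int maclens[] = { 12, 16, 10 };		/* 10 is only there to show the rounding case */
	int i;

	for (i = 0; i < 3; i++) {
		int ao_len  = maclens[i] + TCP_AO_HDR_LEN;	/* tcp_ao_len()         */
		int aligned = ROUND_UP4(ao_len);		/* tcp_ao_len_aligned() */

		printf("maclen %2d -> option len %2d, aligned %2d\n",
		       maclens[i], ao_len, aligned);
	}
	return 0;
}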
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 Darren Senn */ /* These are all the functions necessary to implement itimers */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/sched/signal.h> #include <linux/sched/cputime.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> #include <trace/events/timer.h> #include <linux/compat.h> #include <linux/uaccess.h> /** * itimer_get_remtime - get remaining time for the timer * * @timer: the timer to read * * Returns the delta between the expiry time and now, which can be * less than zero or 1usec for a pending expired timer */ static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above * __hrtimer_get_remaining() call but before this condition * then we return 0 - which is correct.
*/ if (hrtimer_active(timer)) { if (rem <= 0) rem = NSEC_PER_USEC; } else rem = 0; return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; spin_lock_irq(&tsk->sighand->siglock); val = it->expires; interval = it->incr; if (val) { u64 t, samples[CPUCLOCK_MAX]; thread_group_sample_cputime(tsk, samples); t = samples[clock_id]; if (val < t) /* about to fire */ val = TICK_NSEC; else val -= t; } spin_unlock_irq(&tsk->sighand->siglock); value->it_value = ns_to_timespec64(val); value->it_interval = ns_to_timespec64(interval); } static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; switch (which) { case ITIMER_REAL: spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); } return 0; } static int put_itimerval(struct __kernel_old_itimerval __user *o, const struct itimerspec64 *i) { struct __kernel_old_itimerval v; v.it_interval.tv_sec = i->it_interval.tv_sec; v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v.it_value.tv_sec = i->it_value.tv_sec; v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v, sizeof(struct __kernel_old_itimerval)) ? -EFAULT : 0; } SYSCALL_DEFINE2(getitimer, int, which, struct __kernel_old_itimerval __user *, value) { struct itimerspec64 get_buffer; int error = do_getitimer(which, &get_buffer); if (!error && put_itimerval(value, &get_buffer)) error = -EFAULT; return error; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) struct old_itimerval32 { struct old_timeval32 it_interval; struct old_timeval32 it_value; }; static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerspec64 *i) { struct old_itimerval32 v32; v32.it_interval.tv_sec = i->it_interval.tv_sec; v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v32.it_value.tv_sec = i->it_value.tv_sec; v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(getitimer, int, which, struct old_itimerval32 __user *, value) { struct itimerspec64 get_buffer; int error = do_getitimer(which, &get_buffer); if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } #endif /* * The timer is automagically restarted, when interval != 0 */ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; trace_itimer_expire(ITIMER_REAL, leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); return HRTIMER_NORESTART; } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, const struct itimerspec64 *const value, struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; nval = timespec64_to_ns(&value->it_value); ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); oval = it->expires; ointerval = it->incr; if (oval || nval) { if (nval > 0) nval += TICK_NSEC; set_process_cpu_timer(tsk, clock_id, &nval, &oval); } it->expires = nval; it->incr = ninterval; trace_itimer_state(clock_id == CPUCLOCK_VIRT ? ITIMER_VIRTUAL : ITIMER_PROF, value, nval); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { ovalue->it_value = ns_to_timespec64(oval); ovalue->it_interval = ns_to_timespec64(ointerval); } } /* * Returns true if the timeval is in canonical form */ #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) static int do_setitimer(int which, struct itimerspec64 *value, struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; switch (which) { case ITIMER_REAL: again: spin_lock_irq(&tsk->sighand->siglock); timer = &tsk->signal->real_timer; if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); hrtimer_cancel_wait_running(timer); goto again; } expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; } return 0; } #ifdef CONFIG_SECURITY_SELINUX void clear_itimer(void) { struct itimerspec64 v = {}; int i; for (i = 0; i < 3; i++) do_setitimer(i, &v, NULL); } #endif #ifdef __ARCH_WANT_SYS_ALARM /** * alarm_setitimer - set alarm in seconds * * @seconds: number of seconds until alarm * 0 disables the alarm * * Returns the remaining time in seconds of a pending timer or 0 when * the timer is not active. * * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid * negative timeval settings which would cause immediate expiry. 
*/ static unsigned int alarm_setitimer(unsigned int seconds) { struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; it_new.it_value.tv_nsec = 0; it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); /* * We can't return 0 if we have an alarm pending ... And we'd * better return too much than too little anyway */ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; } /* * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ SYSCALL_DEFINE1(alarm, unsigned int, seconds) { return alarm_setitimer(seconds); } #endif static int get_itimerval(struct itimerspec64 *o, const struct __kernel_old_itimerval __user *i) { struct __kernel_old_itimerval v; if (copy_from_user(&v, i, sizeof(struct __kernel_old_itimerval))) return -EFAULT; /* Validate the timevals in value. */ if (!timeval_valid(&v.it_value) || !timeval_valid(&v.it_interval)) return -EINVAL; o->it_interval.tv_sec = v.it_interval.tv_sec; o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v.it_value.tv_sec; o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; return 0; } SYSCALL_DEFINE3(setitimer, int, which, struct __kernel_old_itimerval __user *, value, struct __kernel_old_itimerval __user *, ovalue) { struct itimerspec64 set_buffer, get_buffer; int error; if (value) { error = get_itimerval(&set_buffer, value); if (error) return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." " Misfeature support will be removed\n", current->comm); } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); if (error || !ovalue) return error; if (put_itimerval(ovalue, &get_buffer)) return -EFAULT; return 0; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; /* Validate the timevals in value. */ if (!timeval_valid(&v32.it_value) || !timeval_valid(&v32.it_interval)) return -EINVAL; o->it_interval.tv_sec = v32.it_interval.tv_sec; o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v32.it_value.tv_sec; o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } COMPAT_SYSCALL_DEFINE3(setitimer, int, which, struct old_itimerval32 __user *, value, struct old_itimerval32 __user *, ovalue) { struct itimerspec64 set_buffer, get_buffer; int error; if (value) { error = get_old_itimerval32(&set_buffer, value); if (error) return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." " Misfeature support will be removed\n", current->comm); } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); if (error || !ovalue) return error; if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } #endif
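For completeness, this is how the interface implemented above looks from user space: setitimer(ITIMER_REAL, ...) arms the hrtimer whose expiry runs it_real_fn() and delivers SIGALRM to the thread group. A minimal example using only standard POSIX calls:

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void on_alarm(int sig)
{
	(void)sig;
	fired = 1;		/* SIGALRM delivered when ITIMER_REAL expires */
}

int main(void)
{
	struct itimerval val = {
		.it_value    = { .tv_sec = 1, .tv_usec = 0 },	/* first expiry in 1s */
		.it_interval = { .tv_sec = 0, .tv_usec = 0 },	/* one-shot           */
	};

	signal(SIGALRM, on_alarm);
	setitimer(ITIMER_REAL, &val, NULL);

	while (!fired)
		pause();

	puts("ITIMER_REAL expired");
	return 0;
}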
// SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * * Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> * Changed the compression method from stem compression to "table lookup" * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; const char *tptr; const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; off++; /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ if ((len & 0x80) != 0) { len = (len & 0x7F) | (*data << 7); data++; off++; } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ off += len; /* * For every byte on the compressed symbol data, copy the table * entry for that byte. */ while (len) { tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { if (skipped_first) { if (maxlen <= 1) goto tail; *result = *tptr; result++; maxlen--; } else skipped_first = 1; tptr++; } } tail: if (maxlen) *result = '\0'; /* Return the offset to the next symbol. */ return off; } /* * Get symbol type information. This is encoded as a single char at the * beginning of the symbol name. */ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, * and return the first char from this token. */ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } /* * Find the offset on the compressed stream given an index in the * kallsyms array. */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i, len; /* * Use the closest marker we have. We have markers every 256 positions, * so that should be close enough. */ name = &kallsyms_names[kallsyms_markers[pos >> 8]]; /* * Sequentially scan all the symbols up to the point we're searching * for. Every symbol is stored in a [<len>][<len> bytes of data] format, * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ for (i = 0; i < (pos & 0xFF); i++) { len = *name; /* * If MSB is 1, it is a "big" symbol, so we need to look into * the next byte (and skip it, too).
*/ if ((len & 0x80) != 0) len = ((len & 0x7F) | (name[1] << 7)) + 1; name = name + len + 1; } return name - kallsyms_names; } unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; /* values are unsigned offsets if --absolute-percpu is not in effect */ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; /* ...otherwise, positive offsets are absolute values */ if (kallsyms_offsets[idx] >= 0) return kallsyms_offsets[idx]; /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } static void cleanup_symbol_name(char *s) { char *res; if (!IS_ENABLED(CONFIG_LTO_CLANG)) return; /* * LLVM appends various suffixes for local functions and variables that * must be promoted to global scope as part of LTO. This can break * hooking of static functions with kprobes. '.' is not a valid * character in an identifier in C. Suffixes only in LLVM LTO observed: * - foo.llvm.[0-9a-f]+ */ res = strstr(s, ".llvm."); if (res) *res = '\0'; return; } static int compare_symbol_name(const char *name, char *namebuf) { /* The kallsyms_seqs_of_names is sorted based on names after * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. * To ensure correct bisection in kallsyms_lookup_names(), do * cleanup_symbol_name(namebuf) before comparing name and namebuf. */ cleanup_symbol_name(namebuf); return strcmp(name, namebuf); } static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; for (i = 0; i < 3; i++) seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; return seq; } static int kallsyms_lookup_names(const char *name, unsigned int *start, unsigned int *end) { int ret; int low, mid, high; unsigned int seq, off; char namebuf[KSYM_NAME_LEN]; low = 0; high = kallsyms_num_syms - 1; while (low <= high) { mid = low + (high - low) / 2; seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = compare_symbol_name(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) high = mid - 1; else break; } if (low > high) return -ESRCH; low = mid; while (low) { seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (compare_symbol_name(name, namebuf)) break; low--; } *start = low; if (end) { high = mid; while (high < kallsyms_num_syms - 1) { seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (compare_symbol_name(name, namebuf)) break; high++; } *end = high; } return 0; } /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { int ret; unsigned int i; /* Skip the search for empty string. */ if (!*name) return 0; ret = kallsyms_lookup_names(name, &i, NULL); if (!ret) return kallsyms_sym_address(get_symbol_seq(i)); return module_kallsyms_lookup_name(name); } /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. 
*/ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, kallsyms_sym_address(i)); if (ret != 0) return ret; cond_resched(); } return 0; } int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { int ret; unsigned int i, start, end; ret = kallsyms_lookup_names(name, &start, &end); if (ret) return 0; for (i = start; !ret && i <= end; i++) { ret = fn(data, kallsyms_sym_address(get_symbol_seq(i))); cond_resched(); } return ret; } static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; while (high - low > 1) { mid = low + (high - low) / 2; if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; } /* * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { if (kallsyms_sym_address(i) > symbol_start) { symbol_end = kallsyms_sym_address(i); break; } } /* If we found no next symbol, we use the end of the section. */ if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; } if (symbolsize) *symbolsize = symbol_end - symbol_start; if (offset) *offset = addr - symbol_start; return low; } /* * Lookup an address but don't bother to find any names. */ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; if (modbuildid) *modbuildid = NULL; ret = strlen(namebuf); goto found; } /* See if it's in a module or a BPF JITed image. */ ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); found: cleanup_symbol_name(namebuf); return ret; } /* * Lookup an address * - modname is set to NULL if it's in the kernel. * - We guarantee that the returned name is valid until we reschedule even if. * It resides in a module. * - We also guarantee that modname will be valid until rescheduled. 
*/ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, NULL, namebuf); if (!ret) return NULL; return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) { int res; symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); goto found; } /* See if it's in a module. */ res = lookup_module_symbol_name(addr, symname); if (res) return res; found: cleanup_symbol_name(symname); return 0; } /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) { char *modname; const unsigned char *buildid; unsigned long offset, size; int len; address += symbol_offset; len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); offset -= symbol_offset; if (add_offset) len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) { len += sprintf(buffer + len, " [%s", modname); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) if (add_buildid && buildid) { /* build ID should match length of sprintf */ #if IS_ENABLED(CONFIG_MODULES) static_assert(sizeof(typeof_member(struct module, build_id)) == 20); #endif len += sprintf(buffer + len, " %20phN", buildid); } #endif len += sprintf(buffer + len, "]"); } return len; } /** * sprint_symbol - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size and module name to @buffer if possible. If no symbol was found, * just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 0); } EXPORT_SYMBOL_GPL(sprint_symbol); /** * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size, module name and module build ID to @buffer if possible. If no * symbol was found, just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 1); } EXPORT_SYMBOL_GPL(sprint_symbol_build_id); /** * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name * and module name to @buffer if possible. If no symbol was found, just saves * its @address as is. * * This function returns the number of bytes stored in @buffer. 
*/ int sprint_symbol_no_offset(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 0, 0); } EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); /** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 0); } /** * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. This function also appends the module build ID to * the @buffer if @address is within a kernel module. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 1); } /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; int show_value; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_mod_end = iter->pos; return 0; } return 1; } /* * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace * purposes. In that case "__builtin__ftrace" is used as a module name, even * though "__builtin__ftrace" is not a module. */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_ftrace_mod_end = iter->pos; return 0; } return 1; } static int get_ksymbol_bpf(struct kallsym_iter *iter) { int ret; strscpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name); if (ret < 0) { iter->pos_bpf_end = iter->pos; return 0; } return 1; } /* * This uses "__builtin__kprobes" as a module name for symbols for pages * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a * module. 
*/ static int get_ksymbol_kprobe(struct kallsym_iter *iter) { strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); iter->exported = 0; return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; iter->module_name[0] = '\0'; iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) { iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; iter->pos_bpf_end = 0; } } /* * The end position (last + 1) of each additional kallsyms section is recorded * in iter->pos_..._end as each section is added, and so can be used to * determine which get_ksymbol_...() function to call next. */ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && get_ksymbol_ftrace_mod(iter)) return 1; if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && get_ksymbol_bpf(iter)) return 1; return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ if (pos >= kallsyms_num_syms) return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) reset_iter(iter, pos); iter->nameoff += get_ksymbol_core(iter); iter->pos++; return 1; } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { (*pos)++; if (!update_iter(m->private, *pos)) return NULL; return p; } static void *s_start(struct seq_file *m, loff_t *pos) { if (!update_iter(m->private, *pos)) return NULL; return m->private; } static void s_stop(struct seq_file *m, void *p) { } static int s_show(struct seq_file *m, void *p) { void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; /* * Label it "global" if it is exported, * "local" if not exported. */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show }; #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__ksym { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct kallsym_iter *, ksym); }; static int ksym_prog_seq_show(struct seq_file *m, bool in_stop) { struct bpf_iter__ksym ctx; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = m; prog = bpf_iter_get_info(&meta, in_stop); if (!prog) return 0; ctx.meta = &meta; ctx.ksym = m ? 
m->private : NULL; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p) { return ksym_prog_seq_show(m, false); } static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p) { if (!p) (void) ksym_prog_seq_show(m, true); else s_stop(m, p); } static const struct seq_operations bpf_iter_ksym_ops = { .start = s_start, .next = s_next, .stop = bpf_iter_ksym_seq_stop, .show = bpf_iter_ksym_seq_show, }; static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux) { struct kallsym_iter *iter = priv_data; reset_iter(iter, 0); /* cache here as in kallsyms_open() case; use current process * credentials to tell BPF iterators if values should be shown. */ iter->show_value = kallsyms_show_value(current_cred()); return 0; } DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym) static const struct bpf_iter_seq_info ksym_iter_seq_info = { .seq_ops = &bpf_iter_ksym_ops, .init_seq_private = bpf_iter_ksym_init, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct kallsym_iter), }; static struct bpf_iter_reg ksym_iter_reg_info = { .target = "ksym", .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ksym, ksym), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ksym_iter_seq_info, }; BTF_ID_LIST(btf_ksym_iter_id) BTF_ID(struct, kallsym_iter) static int __init bpf_ksym_iter_register(void) { ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id; return bpf_iter_reg_target(&ksym_iter_reg_info); } late_initcall(bpf_ksym_iter_register); #endif /* CONFIG_BPF_SYSCALL */ static int kallsyms_open(struct inode *inode, struct file *file) { /* * We keep iterator in m->private, since normal case is to * s_start from where we left off, so we avoid doing * using get_symbol_offset for every symbol. */ struct kallsym_iter *iter; iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); /* * Instead of checking this on every s_show() call, cache * the result here at open time. */ iter->show_value = kallsyms_show_value(file->f_cred); return 0; } #ifdef CONFIG_KGDB_KDB const char *kdb_walk_kallsyms(loff_t *pos) { static struct kallsym_iter kdb_walk_kallsyms_iter; if (*pos == 0) { memset(&kdb_walk_kallsyms_iter, 0, sizeof(kdb_walk_kallsyms_iter)); reset_iter(&kdb_walk_kallsyms_iter, 0); } while (1) { if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) return NULL; ++*pos; /* Some debugging symbols have no name. Ignore them. */ if (kdb_walk_kallsyms_iter.name[0]) return kdb_walk_kallsyms_iter.name; } } #endif /* CONFIG_KGDB_KDB */ static const struct proc_ops kallsyms_proc_ops = { .proc_open = kallsyms_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static int __init kallsyms_init(void) { proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops); return 0; } device_initcall(kallsyms_init);
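/*
 * Illustrative sketch (not part of kallsyms.c): the consumer side of the
 * lookup code above, as it might appear in a small "ksym_demo" module. It
 * resolves an address to "name+offset/size [module]" text with
 * sprint_symbol(), which is exported above, and with the %pS printk format.
 * The module name and messages are invented for this example.
 */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/printk.h>

static int __init ksym_demo_init(void)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Resolve this function's own address back to symbolic form. */
	sprint_symbol(buf, (unsigned long)ksym_demo_init);
	pr_info("ksym_demo: sprint_symbol -> %s\n", buf);

	/* The same lookup is done implicitly by the %pS printk specifier. */
	pr_info("ksym_demo: printk %%pS -> %pS\n", ksym_demo_init);

	return 0;
}

static void __exit ksym_demo_exit(void)
{
}

module_init(ksym_demo_init);
module_exit(ksym_demo_exit);
MODULE_LICENSE("GPL");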
92 467 3658 1516 2 512 2471 955 950 950 949 955 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 // SPDX-License-Identifier: GPL-2.0-or-later /* bit search implementation * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Copyright (C) 2008 IBM Corporation * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au> * (Inspired by David Howell's find_next_bit implementation) * * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease * size and improve performance, 2015. */ #include <linux/bitops.h> #include <linux/bitmap.h> #include <linux/export.h> #include <linux/math.h> #include <linux/minmax.h> #include <linux/swab.h> /* * Common helper for find_bit() function family * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) * @MUNGE: The expression that post-processes a word containing found bit (may be empty) * @size: The bitmap size in bits */ #define FIND_FIRST_BIT(FETCH, MUNGE, size) \ ({ \ unsigned long idx, val, sz = (size); \ \ for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \ val = (FETCH); \ if (val) { \ sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz); \ break; \ } \ } \ \ sz; \ }) /* * Common helper for find_next_bit() function family * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) * @MUNGE: The expression that post-processes a word containing found bit (may be empty) * @size: The bitmap size in bits * @start: The bitnumber to start searching at */ #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \ ({ \ unsigned long mask, idx, tmp, sz = (size), __start = (start); \ \ if (unlikely(__start >= sz)) \ goto out; \ \ mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \ idx = __start / BITS_PER_LONG; \ \ for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \ if ((idx + 1) * BITS_PER_LONG >= sz) \ goto out; \ idx++; \ } \ \ sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \ out: \ sz; \ }) #define FIND_NTH_BIT(FETCH, size, num) \ ({ \ unsigned long sz = (size), nr = (num), idx, w, tmp; \ \ for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) { \ if (idx * BITS_PER_LONG + nr >= sz) \ goto out; \ \ tmp = (FETCH); \ w = hweight_long(tmp); \ if (w > nr) \ goto found; \ \ nr -= w; \ } \ \ if (sz % BITS_PER_LONG) \ tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz); \ found: \ sz = idx * BITS_PER_LONG + fns(tmp, nr); \ out: \ sz; \ }) #ifndef find_first_bit /* * Find the first set bit in a memory region. 
*/ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_bit); #endif #ifndef find_first_and_bit /* * Find the first set bit in two memory regions. */ unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_and_bit); #endif /* * Find the first set bit in three memory regions. */ unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { return FIND_FIRST_BIT(addr1[idx] & addr2[idx] & addr3[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_and_and_bit); #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. */ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(~addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_zero_bit); #endif #ifndef find_next_bit unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_bit); #endif unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr[idx], size, n); } EXPORT_SYMBOL(__find_nth_bit); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n); } EXPORT_SYMBOL(__find_nth_and_bit); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n); } EXPORT_SYMBOL(__find_nth_andnot_bit); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n); } EXPORT_SYMBOL(__find_nth_and_andnot_bit); #ifndef find_next_and_bit unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_and_bit); #endif #ifndef find_next_andnot_bit unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_andnot_bit); #endif #ifndef find_next_or_bit unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_or_bit); #endif #ifndef find_next_zero_bit unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_zero_bit); #endif #ifndef find_last_bit unsigned long _find_last_bit(const unsigned long *addr, unsigned long size) { if (size) { unsigned long val = BITMAP_LAST_WORD_MASK(size); unsigned long idx = (size-1) / BITS_PER_LONG; do { val &= addr[idx]; if (val) return idx * BITS_PER_LONG + __fls(val); val = ~0ul; } while (idx--); } 
return size; } EXPORT_SYMBOL(_find_last_bit); #endif unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset) { offset = find_next_bit(addr, size, offset); if (offset == size) return size; offset = round_down(offset, 8); *clump = bitmap_get_value8(addr, offset); return offset; } EXPORT_SYMBOL(find_next_clump8); #ifdef __BIG_ENDIAN #ifndef find_first_zero_bit_le /* * Find the first cleared bit in an LE memory region. */ unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(~addr[idx], swab, size); } EXPORT_SYMBOL(_find_first_zero_bit_le); #endif #ifndef find_next_zero_bit_le unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset) { return FIND_NEXT_BIT(~addr[idx], swab, size, offset); } EXPORT_SYMBOL(_find_next_zero_bit_le); #endif #ifndef find_next_bit_le unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset) { return FIND_NEXT_BIT(addr[idx], swab, size, offset); } EXPORT_SYMBOL(_find_next_bit_le); #endif #endif /* __BIG_ENDIAN */
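/*
 * Illustrative sketch (not part of lib/find_bit.c): typical callers do not
 * use the _find_*_bit() helpers above directly but go through the
 * find_first_bit()/find_next_bit() wrappers and the for_each_set_bit()
 * iterator built on them. The bitmap size and the bit positions below are
 * arbitrary example values.
 */
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/printk.h>

#define DEMO_NBITS 128

static void find_bit_demo(void)
{
	DECLARE_BITMAP(map, DEMO_NBITS);
	unsigned long bit;

	bitmap_zero(map, DEMO_NBITS);
	set_bit(3, map);
	set_bit(70, map);

	/* Returns DEMO_NBITS when no bit is set. */
	bit = find_first_bit(map, DEMO_NBITS);
	pr_info("first set bit:  %lu\n", bit);

	/* Walks every set bit using find_first_bit()/find_next_bit(). */
	for_each_set_bit(bit, map, DEMO_NBITS)
		pr_info("set bit:        %lu\n", bit);

	/* First clear bit, e.g. for a simple ID allocator. */
	bit = find_first_zero_bit(map, DEMO_NBITS);
	pr_info("first zero bit: %lu\n", bit);
}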
75 75 75 38 38 2 2 2 2 38 38 30 30 30 30 30 1 2 1 35 3 35 30 30 35 35 35 23 35 5 5 4 5 9 9 9 9 9 9 9 9 9 32 37 32 37 25 25 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here * by Arnaldo Carvalho de Melo <acme@mandriva.com> */ #include <linux/module.h> #include <linux/random.h> #include <net/addrconf.h> #include <net/hotdata.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> #include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif) { struct sock *sk; const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); /* Optimize here for direct hit, only listening connections can * have wildcards anyways. 
*/ unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet6_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } 
EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && hashinfo == net->ipv4.tcp_death_row.hashinfo) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); if (result) goto done; } hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with in6addr_any */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, &in6addr_any, hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (sk->sk_protocol == IPPROTO_TCP && tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... 
*/ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); int inet6_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet6_hash);
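/*
 * Illustrative userspace sketch (not kernel code): an ordinary IPv6 TCP
 * connect() is the usual way the code above is reached; the kernel picks an
 * ephemeral source port via inet6_hash_connect() and uses
 * __inet6_check_established() to make sure the resulting
 * {saddr, sport, daddr, dport} tuple is unique before hashing the socket.
 * The destination address and port below are placeholders for the example.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in6 dst = {
		.sin6_family = AF_INET6,
		.sin6_port   = htons(80),	/* example destination port */
	};
	int fd = socket(AF_INET6, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET6, "2001:db8::1", &dst.sin6_addr) != 1)
		return 1;

	/*
	 * connect() triggers ephemeral port selection and insertion into the
	 * established hash implemented in this file.
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	close(fd);
	return 0;
}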
9 9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; static inline int is_kernel_text(unsigned long addr) { if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; preempt_disable(); mod = __module_address((unsigned long)ptr); preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #ifdef CONFIG_KALLSYMS unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data); int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. 
*/ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_build_id(char *buffer, unsigned long address) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { return -EOPNOTSUPP; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
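/*
 * Illustrative sketch (not part of this header): because the !CONFIG_KALLSYMS
 * stubs above return empty strings or error codes instead of being absent,
 * callers can be written without #ifdef guards. The function and message
 * below are invented for the example.
 */
#include <linux/kallsyms.h>
#include <linux/printk.h>

static void report_address(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];

	/* With CONFIG_KALLSYMS=n this yields an empty name but still builds. */
	sprint_symbol(sym, addr);
	pr_info("resolved 0x%lx to '%s'\n", addr, sym);

	/* Convenience helper from this header: "[<address>] symbol+off/size". */
	print_ip_sym(KERN_INFO, addr);
}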
8 8 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_IOMAP_H #define LINUX_IOMAP_H 1 #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/blk_types.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/mm_types.h> #include <linux/blkdev.h> struct address_space; struct fiemap_extent_info; struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; struct iov_iter; struct kiocb; struct page; struct vm_area_struct; struct vm_fault; /* * Types of block ranges for iomap mappings: */ #define IOMAP_HOLE 0 /* no blocks allocated, need allocation */ #define IOMAP_DELALLOC 1 /* delayed allocation blocks */ #define IOMAP_MAPPED 2 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */ #define IOMAP_INLINE 4 /* data inline in the inode */ /* * Flags reported by the file system from iomap_begin: * * IOMAP_F_NEW indicates that the blocks have been newly allocated and need * zeroing for areas that no data is copied to. * * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access * written data and requires fdatasync to commit them to persistent storage. * This needs to take into account metadata changes that *may* be made at IO * completion, such as file size updates from direct IO. * * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be * unshared as part a write. * * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block * mappings. * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. * * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent * rather than a file data extent. 
*/ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) #else #define IOMAP_F_BUFFER_HEAD 0 #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. * * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file * range it covers needs to be remapped by the high level before the operation * can proceed. */ #define IOMAP_F_SIZE_CHANGED (1U << 8) #define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ #define IOMAP_F_PRIVATE (1U << 12) /* * Magic value for addr: */ #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ struct iomap_folio_ops; struct iomap { u64 addr; /* disk offset of mapping, bytes */ loff_t offset; /* file offset of mapping, bytes */ u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; void *private; /* filesystem private */ const struct iomap_folio_ops *folio_ops; u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } /* * Returns the inline data pointer for logical offset @pos. */ static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) { return iomap->inline_data + pos - iomap->offset; } /* * Check if the mapping's length is within the valid range for inline data. * This is used to guard against accessing data beyond the page inline_data * points at. */ static inline bool iomap_inline_data_valid(const struct iomap *iomap) { return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); } /* * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio * and put_folio will be called for each folio written to. This only applies * to buffered writes as unbuffered writes will not typically have folios * associated with them. * * When get_folio succeeds, put_folio will always be called to do any * cleanup work necessary. put_folio is responsible for unlocking and putting * @folio. */ struct iomap_folio_ops { struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, unsigned len); void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, struct folio *folio); /* * Check that the cached iomap still maps correctly to the filesystem's * internal extent map. FS internal extent maps can change while iomap * is iterating a cached iomap, so this hook allows iomap to detect that * the iomap needs to be refreshed during a long running write * operation. * * The filesystem can store internal state (e.g. a sequence number) in * iomap->validity_cookie when the iomap is first mapped to be able to * detect changes between mapping time and whenever .iomap_valid() is * called. * * This is called with the folio over the specified file position held * locked by the iomap code. */ bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* * Flags for iomap_begin / iomap_end. No flag implies a read. 
*/ #define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ #define IOMAP_NOWAIT (1 << 5) /* do not block */ #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ #ifdef CONFIG_FS_DAX #define IOMAP_DAX (1 << 8) /* DAX mapping */ #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ struct iomap_ops { /* * Return the existing mapping at pos, or reserve space starting at * pos for up to length, as long as we can do it as a single mapping. * The actual length is returned in iomap->length. */ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap); /* * Commit and/or unreserve space previous allocated using iomap_begin. * Written indicates the length of the successful write operation which * needs to be commited, while the rest needs to be unreserved. * Written might be zero if no data was written. */ int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap); }; /** * struct iomap_iter - Iterate through a range of a file * @inode: Set at the start of the iteration and should not change. * @pos: The current file position we are operating on. It is updated by * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. * @processed: The number of bytes processed by the body in the most recent * iteration, or a negative errno. 0 causes the iteration to stop. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations */ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; s64 processed; unsigned flags; struct iomap iomap; struct iomap srcmap; void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** * iomap_length - length of the current iomap iteration * @iter: iteration structure * * Returns the length that the operation applies to for the current iteration. */ static inline u64 iomap_length(const struct iomap_iter *iter) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); return min(iter->len, end - iter->pos); } /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure * * Write operations on file systems with reflink support might require a * source and a destination map. This function retourns the source map * for a given operation, which may or may no be identical to the destination * map in &i->iomap. 
*/ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) { if (i->srcmap.type != IOMAP_HOLE) return &i->srcmap; return &i->iomap; } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, int (*punch)(struct inode *inode, loff_t pos, loff_t length)); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops); sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ }; static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) { return container_of(bio, struct iomap_ioend, io_bio); } struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. * * Can return arbitrarily large regions, but we need to call into it at * least once per folio to allow the file systems to synchronize with * the write path that could be invalidating mappings. * * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); /* * Optional, allows the file systems to perform actions just before * submitting the bio and/or override the bio end_io handler for complex * operations like copy on write extent manipulation or unwritten extent * conversions. */ int (*prepare_ioend)(struct iomap_ioend *ioend, int status); /* * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. 
*/ void (*discard_folio)(struct folio *folio, loff_t pos); }; struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; u32 nr_folios; /* folios added to the ioend */ }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); void iomap_sort_ioends(struct list_head *ioend_list); int iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, struct iomap_writepage_ctx *wpc, const struct iomap_writeback_ops *ops); /* * Flags for direct I/O ->end_io: */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset); /* * Filesystems wishing to attach private information to a direct io bio * must provide a ->submit_io method that attaches the additional * information to the bio and changes the ->bi_end_io callback to a * custom function. This function should, at a minimum, perform any * relevant post-processing of the bio and end with a call to * iomap_dio_bio_end_io. */ struct bio_set *bio_set; }; /* * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not * synchronous. */ #define IOMAP_DIO_FORCE_WAIT (1 << 0) /* * Do not allocate blocks or zero partial blocks, but instead fall back to * the caller by returning -EAGAIN. Used to optimize direct I/O writes that * are not aligned to the file system block size. */ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) /* * When a page fault occurs, return a partial synchronous result and allow * the caller to retry the rest of the operation after dealing with the page * fault. */ #define IOMAP_DIO_PARTIAL (1 << 2) ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); void iomap_dio_bio_end_io(struct bio *bio); #ifdef CONFIG_SWAP struct file; struct swap_info_struct; int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops); #else # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ #endif /* LINUX_IOMAP_H */
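/*
 * Illustrative sketch (not part of this header): one way a filesystem could
 * wire up the interfaces declared above.  "examplefs", its trivial 1:1 block
 * mapping and the wrapper functions are hypothetical; only struct iomap_ops,
 * struct iomap, the IOMAP_* constants, iomap_read_folio() and
 * iomap_file_buffered_write() come from this header.
 */
#if 0	/* example only, never compiled */
static int examplefs_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	/*
	 * A real filesystem would look up (or, for IOMAP_WRITE, allocate)
	 * the extent covering @pos.  This sketch reports the whole request
	 * as a single mapped extent at the same disk offset.
	 */
	iomap->offset = pos;
	iomap->length = length;
	iomap->type = IOMAP_MAPPED;
	iomap->addr = pos;	/* byte address on disk; 1:1 for illustration */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static const struct iomap_ops examplefs_iomap_ops = {
	.iomap_begin	= examplefs_iomap_begin,
	/* ->iomap_end can be left out when there is nothing to unreserve. */
};

/* Buffered reads and writes then delegate to the iomap helpers. */
static int examplefs_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &examplefs_iomap_ops);
}

static ssize_t examplefs_file_write_iter(struct kiocb *iocb,
		struct iov_iter *from)
{
	return iomap_file_buffered_write(iocb, from, &examplefs_iomap_ops);
}
#endif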
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 */

#ifndef _NET_IPV6_H
#define _NET_IPV6_H

#include <linux/ipv6.h>
#include <linux/hardirq.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/jump_label_ratelimit.h>
#include <net/if_inet6.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/inet_dscp.h>
#include <net/snmp.h>
#include <net/netns/hash.h>

struct ip_tunnel_info;

#define SIN6_LEN_RFC2133	24

#define IPV6_MAXPLEN		65535

/*
 *	NextHeader field of IPv6 header
 */

#define NEXTHDR_HOP		0	/* Hop-by-hop option header. */
#define NEXTHDR_IPV4		4	/* IPv4 in IPv6 */
#define NEXTHDR_TCP		6	/* TCP segment. */
#define NEXTHDR_UDP		17	/* UDP message. */
#define NEXTHDR_IPV6		41	/* IPv6 in IPv6 */
#define NEXTHDR_ROUTING		43	/* Routing header. */
#define NEXTHDR_FRAGMENT	44	/* Fragmentation/reassembly header. */
#define NEXTHDR_GRE		47	/* GRE header. */
#define NEXTHDR_ESP		50	/* Encapsulating security payload. */
#define NEXTHDR_AUTH		51	/* Authentication header. */
#define NEXTHDR_ICMP		58	/* ICMP for IPv6.
*/ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. * We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; /* * Jumbo payload option, as described in RFC 2675 2. 
*/ struct hop_jumbo_hdr { u8 nexthdr; u8 hdrlen; u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ u8 tlv_len; /* 4 */ __be32 jumbo_payload_len; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field 
+256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __u16 gso_size; __s8 dontfrag; struct ipv6_txoptions *opt; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = -1, .dontfrag = -1, }; } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } #if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? 
: ERR_PTR(-ENOENT); return NULL; } #endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct sock *sk); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); static inline struct ipv6_txoptions * ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { if (!opt) return NULL; return __ipv6_fixup_options(opt_space, opt); } bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) return 0; if (skb_network_offset(skb) + sizeof(struct ipv6hdr) + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) return 0; nhdr = ipv6_hdr(skb); if (nhdr->nexthdr != NEXTHDR_HOP) return 0; jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || jhdr->nexthdr != IPPROTO_TCP) return 0; return jhdr->nexthdr; } /* Return 0 if HBH header is successfully removed * Or if HBH removal is unnecessary (packet is not big TCP) * Return error to indicate dropping the packet */ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) { const int hophdr_len = sizeof(struct hop_jumbo_hdr); int nexthdr = ipv6_has_hopopt_jumbo(skb); struct ipv6hdr *h6; if (!nexthdr) return 0; if (skb_cow_head(skb, 0)) return -1; /* Remove the HBH header. * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] */ memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), skb_network_header(skb) - skb_mac_header(skb) + sizeof(struct ipv6hdr)); __skb_pull(skb, hophdr_len); skb->network_header += hophdr_len; skb->mac_header += hophdr_len; h6 = ipv6_hdr(skb); h6->nexthdr = nexthdr; return 0; } static inline bool ipv6_accept_ra(const struct inet6_dev *idev) { s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return READ_ONCE(idev->cnf.forwarding) ? 
accept_ra == 2 : accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned 
long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { return jhash2((__force const u32 *)a->s6_addr32, ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. 
*/ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. 
* --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = READ_ONCE(np->mcast_hops); else hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. */ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel || net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. 
*/ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (net->ipv6.sysctl.flowlabel_state_ranges) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { switch (net->ipv6.sysctl.auto_flowlabels) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return net->ipv6.sysctl.multipath_hash_fields; } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } static inline u32 ip6_multipath_hash_fields(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline dscp_t ip6_dscp(__be32 flowinfo) { return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, 
&inet_sk(sk)->cork, &inet6_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr_p, struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); /* * socket options (ipv6_sockglue.c) */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned 
long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int udplite6_proc_init(void); void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); size_t ipv6_icmp_sysctl_table_size(void); struct ctl_table *ipv6_route_sysctl_init(struct net *net); size_t ipv6_route_sysctl_table_size(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { if (inet_sk(sk)->inet_num) return -EINVAL; lock_sock(sk); sk->sk_ipv6only = true; release_sock(sk); return 0; } static inline void ip6_sock_set_recverr(struct sock *sk) { inet6_set_bit(RECVERR6, sk); } #define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \ IPV6_PREFER_SRC_COA) static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int prefmask = ~IPV6_PREFER_SRC_MASK; unsigned int pref = 0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case 
IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } WRITE_ONCE(inet6_sk(sk)->srcprefs, (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref); return 0; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #endif /* _NET_IPV6_H */
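/*
 * Illustrative sketch (not part of this header): exercising a few of the
 * address helpers declared above.  The function name and the literal
 * addresses (192.0.2.1 and 2001:db8::1, taken from the documentation
 * ranges) are made up for the example; the helpers themselves are the ones
 * defined in this file.
 */
#if 0	/* example only, never compiled */
static void example_ipv6_addr_helpers(void)
{
	struct in6_addr mapped, prefix, addr;

	/* Build ::ffff:192.0.2.1 and check the v4-mapped predicate. */
	ipv6_addr_set_v4mapped(htonl(0xc0000201), &mapped);
	WARN_ON(!ipv6_addr_v4mapped(&mapped));

	/* 2001:db8::1, then derive its /32 prefix. */
	ipv6_addr_set(&addr, htonl(0x20010db8), 0, 0, htonl(1));
	ipv6_addr_prefix(&prefix, &addr, 32);

	/* The first 32 bits match, so the prefix comparison succeeds. */
	WARN_ON(!ipv6_prefix_equal(&prefix, &addr, 32));

	/* The addresses first differ in the very last bit (bit 127). */
	WARN_ON(ipv6_addr_diff(&prefix, &addr) != 127);
}
#endif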
3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/skbuff_ref.h>

#include <trace/events/tcp.h>

/* Refresh clocks of a TCP socket,
 * ensuring monotonically increasing values.
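 * tcp_clock_cache keeps the raw nanosecond value, while tcp_mstamp holds
 * the same instant converted to microseconds (val / NSEC_PER_USEC).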
*/ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. 
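 * This signals CA_EVENT_TX_START to the congestion module when the write
 * starts with nothing in flight, and bumps the pingpong counter when the
 * segment looks like a reply in an interactive exchange.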
*/ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, increase pingpong count. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_inc_pingpong_cnt(sk); } /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack); tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ if (window_clamp == 0) window_clamp = (U16_MAX << TCP_MAX_WSCALE); space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */ static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 old_win = tp->rcv_wnd; u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we * are out of memory. The window is temporary, so we don't store * it on the socket. 
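 * Once ICSK_ACK_NOMEM clears, the next ACK goes through the normal path
 * below and advertises the real window again.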
*/ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) return 0; cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { /* Never shrink the offered window */ if (new_win == 0) NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) { tp->pred_flags = 0; if (old_win) NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; } /* Packet ECN state for a SYN-ACK */ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); } static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (tp->ecn_flags & TCP_ECN_OK) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. 
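 * (SYN and FIN each occupy one unit of sequence space, which is why
 * end_seq becomes seq + 1 when either flag is set.)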
*/ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { struct bpf_sock_ops_kern sock_ops; int err; if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ /* init sock_ops */ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; if (req) { /* The listen "sk" cannot be passed here because * it is not locked. It would not make too much * sense to do bpf_setsockopt(listen_sk) based * on individual connection request also. * * Thus, "req" is passed here and the cgroup-bpf-progs * of the listen "sk" will be run. * * "req" is also used here for fastopen even the "sk" here is * a fullsock "child" sk. It is to keep the behavior * consistent between fastopen and non-fastopen on * the bpf programming side. 
*/ sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = *remaining; /* tcp_current_mss() does not pass a skb */ if (skb) bpf_skops_init_skb(&sock_ops, skb, 0); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err || sock_ops.remaining_opt_len == *remaining) return; opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; /* round up to 4 bytes */ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; struct bpf_sock_ops_kern sock_ops; int err; if (likely(!max_opt_len)) return; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; if (req) { sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = max_opt_len; first_opt_off = tcp_hdrlen(skb) - max_opt_len; bpf_skops_init_skb(&sock_ops, skb, first_opt_off); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err) nr_written = 0; else nr_written = max_opt_len - sock_ops.remaining_opt_len; if (nr_written < max_opt_len) memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { } #endif static __be32 *process_tcp_ao_options(struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key, __be32 *ptr) { #ifdef CONFIG_TCP_AO u8 maclen = tcp_ao_maclen(key->ao_key); if (tcprsk) { u8 aolen = maclen + sizeof(struct tcp_ao_hdr); *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | (tcprsk->ao_keyid << 8) | (tcprsk->ao_rcv_next)); } else { struct tcp_ao_key *rnext_key; struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); rnext_key = READ_ONCE(ao_info->rnext_key); if (WARN_ON_ONCE(!rnext_key)) return ptr; *ptr++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (rnext_key->rcvid)); } opts->hash_location = (__u8 *)ptr; ptr += maclen / sizeof(*ptr); if (unlikely(maclen % sizeof(*ptr))) { memset(ptr, TCPOPT_NOP, sizeof(*ptr)); ptr++; } #endif return ptr; } /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. 
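 * For reference, the order actually emitted below is: MD5/AO signature,
 * MSS, timestamps (folded together with SACK_PERM when both are present),
 * standalone SACK_PERM, window scale, SACK blocks, Fast Open cookie,
 * SMC and finally MPTCP.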
* Luckily we can at least blame others for their non-compliance but from * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). * * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key) { __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); } if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); } *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); } if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? 
tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } tp->rx_opt.dsack = 0; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; u8 *p = (u8 *)ptr; u32 len; /* Fast Open option length */ if (foc->exp) { len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FASTOPEN_MAGIC); p += TCPOLEN_EXP_FASTOPEN_BASE; } else { len = TCPOLEN_FASTOPEN_BASE + foc->len; *p++ = TCPOPT_FASTOPEN; *p++ = len; } memcpy(p, foc->val, foc->len); if ((len & 3) == 2) { p[foc->len] = TCPOPT_NOP; p[foc->len + 1] = TCPOPT_NOP; } ptr += (len + 3) >> 2; } smc_options_write(ptr, &options); mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, const struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void mptcp_set_option_cond(const struct request_sock *req, struct tcp_out_options *opts, unsigned int *remaining) { if (rsk_is_mptcp(req)) { unsigned int size; if (mptcp_synack_options(req, &size, &opts->mptcp)) { if (*remaining >= size) { opts->options |= OPTION_MPTCP; *remaining -= size; } } } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; bool timestamps; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { timestamps = false; opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; } else { timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps); if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); } } /* We always get an MSS option. The option bytes which will be seen in * normal data packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so that * calculations in tcp_sendmsg are simpler etc. So account for this * fact here if necessary. If we don't do this correctly, as a * receiver we won't recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK rules correctly. * SACKs don't matter, we never delay an ACK when we have any of those * going out. 
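 * Put differently: the MSS we advertise has to cover the timestamp bytes
 * carried on every data segment, while tp->mss_cache already has them
 * subtracted.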
*/ opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; if (likely(timestamps)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } smc_set_option(tp, opts, &remaining); if (sk_is_mptcp(sk)) { unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { opts->options |= OPTION_MPTCP; remaining -= size; } } bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ static unsigned int tcp_synack_options(const struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_key *key, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; /* We can't fit any SACK blocks in a packet with MD5 + TS * options. There was discussion about disabling SACK * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ if (synack_type != TCP_SYNACK_COOKIE) ireq->tstamp_ok &= !ireq->sack_ok; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); ireq->tstamp_ok &= !ireq->sack_ok; } /* We always send an MSS option. */ opts->mss = mss; remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { u32 need = foc->len; need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = foc; remaining -= need; } } mptcp_set_option_cond(req, opts, &remaining); smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. 
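 * The return value is the option size in bytes; callers add it to
 * sizeof(struct tcphdr) to obtain the full header length.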
*/ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; opts->options = 0; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; size += TCPOLEN_MD5SIG_ALIGNED; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; size += tcp_ao_len_aligned(key->ao_key); } if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } /* MPTCP options have precedence over SACK for the limited TCP * option space because a MPTCP connection would be forced to * fall back to regular TCP if a required multipath option is * missing. SACK still gets a chance to use whatever space is * left. */ if (sk_is_mptcp(sk)) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; unsigned int opt_size = 0; if (mptcp_established_options(sk, skb, &opt_size, remaining, &opts->mptcp)) { opts->options |= OPTION_MPTCP; size += opt_size; } } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)) return size; opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); size = MAX_TCP_OPTION_SPACE - remaining; } return size; } /* TCP SMALL QUEUES (TSQ) * * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. * We use one tasklet per cpu, with its own queue of sockets. */ struct tsq_tasklet { struct tasklet_struct tasklet; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One tasklet per cpu tries to send more skbs. 
* We run in tasklet context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(struct tasklet_struct *t) { struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; struct sock *sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); local_irq_restore(flags); list_for_each_safe(q, n, &list) { tp = list_entry(q, struct tcp_sock, tsq_node); list_del(&tp->tsq_node); sk = (struct sock *)tp; smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); tcp_tsq_handler(sk); sk_free(sk); } } #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket * * called from release_sock() to perform protocol dependent * actions before socket release. */ void tcp_release_cb(struct sock *sk) { unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); } if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } EXPORT_SYMBOL(tcp_release_cb); void __init tcp_tasklet_init(void) { int i; for_each_possible_cpu(i) { struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } /* * Write buffer destructor automatically called from kfree_skb. * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; struct tsq_tasklet *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tasklet_func() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : * - less callbacks to tcp_write_xmit(), reducing stress (batches) * - chance for incoming ACK (processed by another cpu maybe) * to migrate this flow (skb->ooo_okay will be eventually set) */ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; oval = smp_load_acquire(&sk->sk_tsq_flags); do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; out: sk_free(sk); } /* Note: Called under soft irq. * We can call TCP stack right away, unless socket is owned by user. 
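 * When the socket is owned, tcp_tsq_handler() sets TCP_TSQ_DEFERRED
 * instead, and tcp_release_cb() finishes the job at release_sock() time.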
*/ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. 
*/ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. * The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. 
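 * so the raw value is simply clamped to 65535 below.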
*/ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); } /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs; if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ TCP_SKB_CB(skb)->tcp_gso_size = 0; tcp_skb_pcount_set(skb, 1); return 1; } TCP_SKB_CB(skb)->tcp_gso_size = mss_now; tso_segs = DIV_ROUND_UP(skb->len, mss_now); tcp_skb_pcount_set(skb, tso_segs); return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... 
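 * Without SACK, sacked_out only approximates duplicate ACKs, so it is
 * clamped here rather than adjusted exactly.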
*/ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int old_factor; long limit; int nlen; u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. 
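 * The first half keeps the remaining flags, while buff inherits FIN/PSH
 * so they still travel with the last bytes of the original payload.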
*/ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); skb_set_delivery_time(buff, skb->tstamp, true); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Update delivered info for the new segment */ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; /* If this packet has been sent out already, we must * adjust the various packet counters. */ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); if (diff) tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } /* This is similar to __pskb_pull_tail(). The difference is that pulled * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } shinfo->nr_frags = k; skb->data_len -= len; skb->len = skb->data_len; return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } /* Calculate MSS not accounting any TCP options. */ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. 
*/ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. It is minimum of user_mss and mss received with SYN. It also does not include TCP options. inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, taking into account current pmtu, but never exceeds tp->rx_opt.mss_clamp. NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; } EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned int header_len; struct tcp_out_options opts; struct tcp_key key; mss_now = tp->mss_cache; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } tcp_get_current_key(sk, &key); header_len = tcp_established_options(sk, NULL, &opts, &key) + sizeof(struct tcphdr); /* The mss_cache is sized based on tp->tcp_header_len, which assumes * some common options. If this is an odd packet (because we have SACK * blocks etc) then our calculated header_len will be different, and * we have to adjust mss_now correspondingly */ if (header_len != tp->tcp_header_len) { int delta = (int) header_len - tp->tcp_header_len; mss_now -= delta; } return mss_now; } /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. 
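 * Illustrative: if cwnd is 100 segments but at most 20 were ever in flight
 * while the application trickled data, ssthresh is refreshed and cwnd is
 * pulled back to (100 + 20) / 2 = 60 below.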
* As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. */ static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the strongest available signal of the degree to which the cwnd * is fully utilized. If cwnd-limited then remember that fact for the * current window. If not cwnd-limited then track the maximum number of * outstanding packets in the current window. (If cwnd-limited then we * chose to not update tp->max_packets_out to avoid an extra else * clause with no functional impact.) */ if (!before(tp->snd_una, tp->cwnd_usage_seq) || is_cwnd_limited || (!tp->is_cwnd_limited && tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; tp->max_packets_out = tp->packets_out; tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } /* Minshall's variant of the Nagle send check. */ static bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } /* Update snd_sml if this skb is under mss * Note that a TSO packet might end with a sub-mss segment * The test is really : * if ((skb->len % mss) != 0) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } /* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. (provided by caller in %partial bool) * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. 
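 * Put differently, the check below defers only a sub-MSS segment, and only
 * when TCP_CORK is set, or when Nagle is enabled (nonagle == 0), data is
 * still unacknowledged and the last small segment has not been acked yet
 * (tcp_minshall_check()).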
*/ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) return tcp_set_skb_tso_segs(skb, mss_now); return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. 
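 * Illustrative cases: a pushed skb (TCP_NAGLE_PUSH), urgent data, or a FIN
 * is always sendable; a 400-byte skb with cur_mss 1448 is held back only
 * when tcp_nagle_check() above says Nagle/CORK applies.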
*/ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; } /* Does at least the first segment of SKB fit into the send window? */ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; return !after(end_seq, tcp_wnd_end(tp)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions * in order to speed up the splitting operation. In particular, we * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { int nlen = skb->len - len; struct sk_buff *buff; u8 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. * * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, bool *is_rwnd_limited, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer * only if the last write was recent (1 ms). * Note that tp->tcp_wstamp_ns can be in the future if we have * packets waiting in a qdisc or device for EDT delivery. 
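 * Illustrative: if the last skb was stamped 3 ms in the past, delta below
 * is roughly +2 ms and we transmit right away instead of deferring.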
*/ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized * frame, so if we have space for more than 3 frames * then send now. */ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } /* TODO : use tsorted_sent_queue ? */ head = tcp_rtx_queue_head(sk); if (!head) goto send_now; delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. * Three cases are tracked : * 1) We are cwnd-limited * 2) We are rwnd-limited * 3) We are application limited. */ if (cong_win < send_win) { if (cong_win <= skb->len) { *is_cwnd_limited = true; return true; } } else { if (send_win <= skb->len) { *is_rwnd_limited = true; return true; } } /* If this packet won't get more data, do not wait. 
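 * (e.g. it already carries FIN or is marked EOR, so no later write can be
 * merged into it and deferring would only add latency)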
*/ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || TCP_SKB_CB(skb)->eor) goto send_now; return true; send_now: return false; } static inline void tcp_mtu_check_reprobe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 interval; s32 delta; interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); /* Update current search range */ icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) { struct sk_buff *skb, *next; skb = tcp_send_head(sk); tcp_for_write_queue_from_safe(skb, next, sk) { if (len <= skb->len) break; if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb) || !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; } return true; } static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, int probe_size) { skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; int i, todo, len = 0, nr_frags = 0; const struct sk_buff *skb; if (!sk_wmem_schedule(sk, to->truesize + probe_size)) return -ENOMEM; skb_queue_walk(&sk->sk_write_queue, skb) { const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; if (skb_headlen(skb)) return -EINVAL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { if (len >= probe_size) goto commit; todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + skb_frag_size(lastfrag)) { skb_frag_size_add(lastfrag, todo); continue; } if (unlikely(nr_frags == MAX_SKB_FRAGS)) return -E2BIG; skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); nr_frags++; lastfrag = fragto++; } } commit: WARN_ON_ONCE(len != probe_size); for (i = 0; i < nr_frags; i++) skb_frag_ref(to, i); skb_shinfo(to)->nr_frags = nr_frags; to->truesize += probe_size; to->len += probe_size; to->data_len += probe_size; __skb_header_release(to); return 0; } /* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if * all its payload was moved to another one (dst). * Make sure to transfer tcp_flags, eor, and tstamp. */ static void tcp_eat_one_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src) { TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; tcp_skb_collapse_tstamp(dst, src); tcp_unlink_write_queue(src, sk); tcp_wmem_free_skb(sk, src); } /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing * changes resulting in larger path MTUs. 
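 * Illustrative: with icsk_mtup.search_low = 1024 and search_high = 1500
 * (both MTU values), the probe targets tcp_mtu_to_mss(sk, (1024 + 1500) / 2),
 * i.e. an MTU of 1262, and size_needed additionally reserves
 * (reordering + 1) full-sized segments of queued data.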
* * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise */ static int tcp_mtu_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); int probe_size; int size_needed; int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and * not SACKing (the variable headers throw things off) */ if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, * and current mss_clamp. if (search_high - search_low) * smaller than a threshold, backoff from probing. */ mss_now = tcp_current_mss(sk); probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; /* When misfortune happens, we are reprobing actively, * and then reprobe timer has expired. We stick with current * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ tcp_mtu_check_reprobe(sk); return -1; } /* Have enough data in the send queue to probe? */ if (tp->write_seq - tp->snd_nxt < size_needed) return -1; if (tp->snd_wnd < size_needed) return -1; if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else return 0; } if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) return -1; /* We're allowed to probe. Build it now. */ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } len += copy; if (len >= probe_size) break; } tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. 
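 * (e.g. a cwnd of 20 temporarily becomes 19 while the oversized probe is
 * outstanding)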
*/ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; return 1; } return -1; } static bool tcp_pacing_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_needs_internal_pacing(sk)) return false; if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) return false; if (!hrtimer_is_queued(&tp->pacing_timer)) { hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } return true; } static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) { const struct rb_node *node = sk->tcp_rtx_queue.rb_node; /* No skb in the rtx queue. */ if (!node) return true; /* Only one skb in rtx queue. */ return !node->rb_left && !node->rb_right; } /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) * This allows for : * - better RTT estimation and ACK scheduling * - faster recovery * - high rates * Alas, some drivers / subsystems require a fair amount * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { unsigned long limit; limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && tcp_sk(sk)->tcp_tx_delay) { u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) * tcp_sk(sk)->tcp_tx_delay; /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we * approximate our needs assuming an ~100% skb->truesize overhead. * USEC_PER_SEC is approximated by 2^20. * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. */ extra_bytes >>= (20 - 1); limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. */ smp_mb__after_atomic(); if (refcount_read(&sk->sk_wmem_alloc) > limit) return true; } return false; } static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. 
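 * e.g. a flow accounted as TCP_CHRONO_BUSY that then fills the peer's
 * receive window moves to TCP_CHRONO_RWND_LIMITED, assuming the enum ranks
 * the latter higher.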
*/ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* There are multiple conditions worthy of tracking in a * chronograph, so that the highest priority enum takes * precedence over the other conditions (see tcp_chrono_start). * If a condition stops, we only stop chrono tracking if * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } /* First skb in the write queue is smaller than ideal packet size. * Check if we can move payload from the second skb in the queue. */ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) { struct sk_buff *next_skb = skb->next; unsigned int nlen; if (tcp_skb_is_last(sk, skb)) return; if (!tcp_skb_can_collapse(skb, next_skb)) return; nlen = min_t(u32, amount, next_skb->len); if (!nlen || !skb_shift(skb, next_skb, nlen)) return; TCP_SKB_CB(skb)->end_seq += nlen; TCP_SKB_CB(next_skb)->seq += nlen; if (!next_skb->len) { /* In case FIN is set, we need to update end_seq */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; tcp_eat_one_skb(sk, skb, next_skb); } } /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * * LARGESEND note: !tcp_urg_mode is overkill, only frames between * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * * Send at most one packet when push_one > 0. Temporarily ignore * cwnd limit to force at most one packet out when push_one == 2. * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; sent_pkts = 0; tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { return false; } else if (result > 0) { sent_pkts = 1; } } max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } if (tcp_pacing_check(sk)) break; cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; else break; } cwnd_quota = min(cwnd_quota, max_segs); missing_bytes = cwnd_quota * mss_now - skb->len; if (missing_bytes > 0) tcp_grow_skb(sk, skb, missing_bytes); tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? 
nonagle : TCP_NAGLE_PUSH)))) break; } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota, nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) break; /* Argh, we hit an empty skb(), presumably a thread * is sleeping in sendmsg()/sk_stream_wait_memory(). * We do not want to send a pure-ack packet and have * a strange looking rtx queue with empty packet(s). */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (is_rwnd_limited) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || (icsk->icsk_ca_state != TCP_CA_Open && icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) timeout_us += tcp_rto_min_us(sk); else timeout_us += TCP_TIMEOUT_MIN_US; timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = advancing_rto ? jiffies_to_usecs(inet_csk(sk)->icsk_rto) : tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! 
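 * We set TSQ_THROTTLED and re-check after the barrier, so a TX completion
 * racing with us still lets the retransmit proceed instead of being skipped.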
*/ static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); if (skb_fclone_busy(sk, skb)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } } return false; } /* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); /* At most one outstanding TLP */ if (tp->tlp_high_seq) goto rearm_timer; tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); inet_csk(sk)->icsk_pending = 0; return; } if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; tp->tlp_retrans = 1; probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; rearm_timer: tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct sk_buff *skb = tcp_send_head(sk); BUG_ON(!skb || skb->len < mss_now); tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep * RECV.NEXT + RCV.WIN fixed until: * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. 
The problem is that under this rule * a stream of single byte packets will cause the right side of the * window to always advance by a single byte. * * Of course, if the sender implements sender side SWS prevention * then this will not be a problem. * * BSD seems to make the following compromise: * * If the free space is less than the 1/4 of the maximum * space available and the free space is less than 1/2 mss, * then set the window to 0. * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] * Otherwise, just prevent the window from shrinking * and from being larger than the largest representable value. * * This prevents incremental opening of the window in the regime * where TCP is limited by the speed of the reader side taking * data out of the TCP receive queue. It does nothing about * those cases where the window is constrained on the sender side * because the pipeline is full. * * BSD also seems to "accidentally" limit itself to windows that are a * multiple of MSS, at least until the free space gets quite small. * This would appear to be a side effect of the mbuf implementation. * Combining these two algorithms results in the observed behavior * of having a fixed window size at almost all times. * * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. * Regular options like TIMESTAMP are taken into account. */ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); int full_space, window; if (sk_is_mptcp(sk)) mptcp_space(sk, &free_space, &allowed_space); full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; if (mss <= 0) return 0; } /* Only allow window shrink if the sysctl is enabled and we have * a non-zero scaling factor in effect. */ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) goto shrink_window_allowed; /* do not allow window to shrink */ if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); /* if free space is less than mss estimate, or is below 1/16th * of the maximum allowed, try to move to zero-window, else * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and * new incoming data is dropped due to memory limits. * With large window, mss test triggers way too late in order * to announce zero window in time before rmem limit kicks in. */ if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { window = free_space; /* Advertise enough space so that it won't get scaled away. 
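 * (with rcv_wscale = 7, the ALIGN below rounds e.g. 100 free bytes up to
 * 128, so the advertised scaled value is 1 rather than 0)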
* Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the * free space we just keep it. This prevents the divide * and multiply from happening most of the time. * We also don't do any window rounding when the free space * is too small. */ if (window <= free_space - mss || window > free_space) window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; } return window; shrink_window_allowed: /* new window should always be an exact multiple of scaling factor */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* if free space is too low, return a zero window */ if (free_space < (allowed_space >> 4) || free_space < mss || free_space < (1 << tp->rx_opt.rcv_wscale)) return 0; } if (free_space > tp->rcv_ssthresh) { free_space = tp->rcv_ssthresh; /* new window should always be an exact multiple of scaling factor * * For this case, we ALIGN "up" (increase free_space) because * we know free_space is not zero here, it has been reduced from * the memory-based limit, and rcv_ssthresh is not a hard limit * (unlike sk_rcvbuf). */ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); } return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { if (unlikely(tcp_has_tx_tstamp(next_skb))) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; } } /* Collapses two adjacent SKB's during retransmission. */ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. 
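 * (a candidate skb must be a single segment, not cloned, and not SACKed;
 * see the checks below)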
*/ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; bool first = true; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; if (first) { first = false; continue; } if (space < 0) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; if (!tcp_collapse_retrans(sk, to)) break; } } /* This retransmits one SKB. Policy decisions and retransmit queue * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; if (skb_still_in_host_queue(sk, skb)) return -EBUSY; start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; TCP_SKB_CB(skb)->seq++; goto start; } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); return -EINVAL; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit of one segment serves as a zero window probe. */ if (avail_wnd <= 0) { if (TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; avail_wnd = cur_mss; } len = cur_mss * segs; if (len > avail_wnd) { len = rounddown(avail_wnd, cur_mss); if (!len) len = avail_wnd; } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); avail_wnd = min_t(int, avail_wnd, cur_mss); if (skb->len < avail_wnd) tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); /* Update global and local TCP statistics. 
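 * (TCP_MIB_RETRANSSEGS is bumped by the skb's segment count,
 * LINUX_MIB_TCPSYNRETRANS only for SYN retransmits, and bytes_retrans
 * grows by skb->len)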
*/ segs = tcp_skb_pcount(skb); TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); if (nskb) { nskb->dev = NULL; err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); } else { err = -ENOBUFS; } } tcp_skb_tsorted_restore(skb); if (!err) { tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); } /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); return err; } /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. 
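 * Illustrative: with cwnd 10 and 7 packets in flight, each pass of the loop
 * below may retransmit at most 3 segments, further clamped by max_segs from
 * tcp_tso_segs().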
*/ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); bool rearm_timer = false; u32 max_segs; int mib_idx; if (!tp->packets_out) return; rtx_head = tcp_rtx_queue_head(sk); skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; int segs; if (tcp_pacing_check(sk)) break; /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ segs = min_t(int, segs, max_segs); if (tp->retrans_out >= tp->lost_out) { break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; if (tcp_small_queue_check(sk, skb, 1)) break; if (tcp_retransmit_skb(sk, skb, segs)) break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) rearm_timer = true; } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and * this skb was not yet sent, or we are under memory pressure. * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. 
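 * (hence the manual snd_nxt bump just below; the FIN itself reaches the
 * peer when that already-sent skb is eventually retransmitted)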
*/ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!skb)) return; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason) { struct sk_buff *skb; TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. */ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. 
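 *
 * Return: the prepared SYNACK skb, or NULL on allocation failure or when no
 * matching TCP-AO key is found.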
*/ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_w(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. */ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), true); else #endif { skb_set_delivery_time(skb, now, true); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
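 * (hence the raw clamp to 65535 below, independent of any window scaling
 * advertised in the options)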
*/ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, true); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_SYMBOL(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. 
The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. */ TCP_SKB_CB(syn_data)->seq++; TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } /* data was not sent, put it in write_queue */ __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; } /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO) /* Has to be checked late, after setting daddr/saddr/ops. * Return error if the peer has both a md5 and a tcp-ao key * configured as this is ambiguous. */ if (unlikely(rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)))) { bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1); bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk); struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(sk)); if (ao_info) { /* This is an extra check: tcp_ao_required() in * tcp_v{4,6}_parse_md5_keys() should prevent adding * md5 keys on ao_required socket. */ needs_ao |= ao_info->ao_required; WARN_ON_ONCE(ao_info->ao_required && needs_md5); } if (needs_md5 && needs_ao) return -EKEYREJECTED; /* If we have a matching md5 key and no matching tcp-ao key * then free up ao_info if allocated. 
*/ if (needs_md5) { tcp_ao_destroy_sock(sk, false); } else if (needs_ao) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, lockdep_sock_is_held(sk))); } } #endif #ifdef CONFIG_TCP_AO if (unlikely(rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)))) { /* Don't allow connecting if ao is configured but no * matching key is found. */ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1)) return -EKEYREJECTED; } #endif if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 delack_max = inet_csk(sk)->icsk_delack_max; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) { u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); u32 delack_from_rto_min = max_t(int, 1, rto_min - 1); delack_max = min_t(u32, delack_max, delack_from_rto_min); } return delack_max; } /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ void tcp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt_us) { int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; } ato = min(ato, max_ato); } ato = min_t(u32, ato, tcp_delack_max(sk)); /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. 
*/ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; /* If we have been reset, we may not send again. */ if (sk->sk_state == TCP_CLOSE) return; /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; if (delay < TCP_RTO_MAX) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 */ skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. * * Question: what should we make while urgent mode? * 4.4BSD forces sending single byte of data. We cannot send * out of window data, because we have SND.NXT==SND.MAX... * * Current solution: to send TWO zero-length segments in urgent mode: * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } /* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. 
*/ int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (sk->sk_state == TCP_CLOSE) return -1; skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1, mib); return tcp_xmit_probe_skb(sk, 0, mib); } } /* A window probe timeout has occurred. If window is not closed send * a partial packet else a zero probe. */ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); } return res; } EXPORT_SYMBOL(tcp_rtx_synack);
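/*
 * Editor's illustration (not part of the kernel sources above): a minimal
 * userspace sketch of the client path served by tcp_send_syn_data(). With
 * client-side TCP Fast Open permitted (net.ipv4.tcp_fastopen bit 0x1),
 * sendto() with MSG_FASTOPEN performs the implicit connect; if a Fast Open
 * cookie is cached for the peer the payload rides in the SYN, otherwise the
 * kernel falls back to a plain SYN carrying a cookie request, as the
 * fallback label above does. The address and payload are placeholders.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000	/* value from the kernel UAPI */
#endif

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET };
	const char msg[] = "GET / HTTP/1.0\r\n\r\n";
	int fd;

	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	/* Implicit connect: the data may be carried in the SYN (TFO). */
	if (sendto(fd, msg, strlen(msg), MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}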
/* SPDX-License-Identifier: GPL-2.0 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM page_pool

#if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_POOL_H

#include <linux/types.h>
#include <linux/tracepoint.h>

#include <trace/events/mmflags.h>
#include <net/page_pool/types.h>

TRACE_EVENT(page_pool_release,

	TP_PROTO(const struct page_pool *pool,
		 s32 inflight, u32 hold, u32 release),

	TP_ARGS(pool, inflight, hold, release),

	TP_STRUCT__entry(
		__field(const struct page_pool *, pool)
		__field(s32, inflight)
		__field(u32, hold)
		__field(u32, release)
		__field(u64, cnt)
	),

	TP_fast_assign(
		__entry->pool = pool;
		__entry->inflight = inflight;
		__entry->hold = hold;
		__entry->release = release;
		__entry->cnt = pool->destroy_cnt;
	),

	TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu",
		  __entry->pool, __entry->inflight, __entry->hold,
		  __entry->release, __entry->cnt)
);

TRACE_EVENT(page_pool_state_release,

	TP_PROTO(const struct page_pool *pool,
		 const struct page *page, u32 release),

	TP_ARGS(pool, page, release),

	TP_STRUCT__entry(
		__field(const struct page_pool *, pool)
		__field(const struct page *, page)
		__field(u32, release)
		__field(unsigned long, pfn)
	),

	TP_fast_assign(
		__entry->pool = pool;
		__entry->page = page;
		__entry->release = release;
		__entry->pfn = page_to_pfn(page);
	),

	TP_printk("page_pool=%p page=%p pfn=0x%lx release=%u",
		  __entry->pool, __entry->page, __entry->pfn, __entry->release)
);

TRACE_EVENT(page_pool_state_hold,

	TP_PROTO(const struct page_pool *pool,
		 const struct page *page, u32 hold),

	TP_ARGS(pool, page, hold),

	TP_STRUCT__entry(
		__field(const struct page_pool *, pool)
		__field(const struct page *, page)
		__field(u32, hold)
		__field(unsigned long, pfn)
	),

	TP_fast_assign(
		__entry->pool = pool;
		__entry->page = page;
		__entry->hold = hold;
		__entry->pfn = page_to_pfn(page);
	),

	TP_printk("page_pool=%p page=%p pfn=0x%lx hold=%u",
		  __entry->pool, __entry->page, __entry->pfn, __entry->hold)
);

TRACE_EVENT(page_pool_update_nid,

	TP_PROTO(const struct page_pool *pool, int new_nid),

	TP_ARGS(pool, new_nid),

	TP_STRUCT__entry(
		__field(const struct page_pool *, pool)
		__field(int, pool_nid)
		__field(int, new_nid)
	),

	TP_fast_assign(
		__entry->pool = pool;
		__entry->pool_nid = pool->p.nid;
		__entry->new_nid = new_nid;
	),

	TP_printk("page_pool=%p pool_nid=%d new_nid=%d",
		  __entry->pool, __entry->pool_nid, __entry->new_nid)
);

#endif /* _TRACE_PAGE_POOL_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
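/*
 * Editor's illustration (not part of the header above): each TRACE_EVENT()
 * definition becomes a tracefs event under events/page_pool/. A minimal
 * sketch that enables one of them and streams the TP_printk() output; the
 * tracefs mount point (commonly /sys/kernel/tracing, or
 * /sys/kernel/debug/tracing on older setups) is an assumption here.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/tracing"

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Enable the page_pool_state_release event defined above. */
	fd = open(TRACEFS "/events/page_pool/page_pool_state_release/enable",
		  O_WRONLY);
	if (fd < 0) {
		perror("open enable");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write enable");
	close(fd);

	/* Stream the formatted event records. */
	fd = open(TRACEFS "/trace_pipe", O_RDONLY);
	if (fd < 0) {
		perror("open trace_pipe");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}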
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Joystick device driver for the input driver suite.
 *
 * Copyright (c) 1999-2002 Vojtech Pavlik
 * Copyright (c) 1999 Colin Van Dyke
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/io.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/joystick.h>
#include <linux/input.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/cdev.h>

MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
MODULE_DESCRIPTION("Joystick device interfaces");
MODULE_LICENSE("GPL");

#define JOYDEV_MINOR_BASE	0
#define JOYDEV_MINORS		16
#define JOYDEV_BUFFER_SIZE	64

struct joydev {
	int open;
	struct input_handle handle;
	wait_queue_head_t wait;
	struct list_head client_list;
	spinlock_t client_lock; /* protects client_list */
	struct mutex mutex;
	struct device dev;
	struct cdev cdev;
	bool exist;

	struct js_corr corr[ABS_CNT];
	struct JS_DATA_SAVE_TYPE glue;
	int nabs;
	int nkey;
	__u16 keymap[KEY_MAX - BTN_MISC + 1];
	__u16 keypam[KEY_MAX - BTN_MISC + 1];
	__u8 absmap[ABS_CNT];
	__u8 abspam[ABS_CNT];
	__s16 abs[ABS_CNT];
};

struct joydev_client {
	struct js_event buffer[JOYDEV_BUFFER_SIZE];
	int head;
	int tail;
	int startup;
	spinlock_t buffer_lock; /* protects access to buffer, head and tail */
	struct fasync_struct *fasync;
	struct joydev *joydev;
	struct list_head node;
};

static int joydev_correct(int value, struct js_corr *corr)
{
	switch (corr->type) {

	case JS_CORR_NONE:
		break;

	case JS_CORR_BROKEN:
		value = value > corr->coef[0] ? (value < corr->coef[1] ?
0 : ((corr->coef[3] * (value - corr->coef[1])) >> 14)) : ((corr->coef[2] * (value - corr->coef[0])) >> 14); break; default: return 0; } return clamp(value, -32767, 32767); } static void joydev_pass_event(struct joydev_client *client, struct js_event *event) { struct joydev *joydev = client->joydev; /* * IRQs already disabled, just acquire the lock */ spin_lock(&client->buffer_lock); client->buffer[client->head] = *event; if (client->startup == joydev->nabs + joydev->nkey) { client->head++; client->head &= JOYDEV_BUFFER_SIZE - 1; if (client->tail == client->head) client->startup = 0; } spin_unlock(&client->buffer_lock); kill_fasync(&client->fasync, SIGIO, POLL_IN); } static void joydev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct joydev *joydev = handle->private; struct joydev_client *client; struct js_event event; switch (type) { case EV_KEY: if (code < BTN_MISC || value == 2) return; event.type = JS_EVENT_BUTTON; event.number = joydev->keymap[code - BTN_MISC]; event.value = value; break; case EV_ABS: event.type = JS_EVENT_AXIS; event.number = joydev->absmap[code]; event.value = joydev_correct(value, &joydev->corr[event.number]); if (event.value == joydev->abs[event.number]) return; joydev->abs[event.number] = event.value; break; default: return; } event.time = jiffies_to_msecs(jiffies); rcu_read_lock(); list_for_each_entry_rcu(client, &joydev->client_list, node) joydev_pass_event(client, &event); rcu_read_unlock(); wake_up_interruptible(&joydev->wait); } static int joydev_fasync(int fd, struct file *file, int on) { struct joydev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void joydev_free(struct device *dev) { struct joydev *joydev = container_of(dev, struct joydev, dev); input_put_device(joydev->handle.dev); kfree(joydev); } static void joydev_attach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_add_tail_rcu(&client->node, &joydev->client_list); spin_unlock(&joydev->client_lock); } static void joydev_detach_client(struct joydev *joydev, struct joydev_client *client) { spin_lock(&joydev->client_lock); list_del_rcu(&client->node); spin_unlock(&joydev->client_lock); synchronize_rcu(); } static void joydev_refresh_state(struct joydev *joydev) { struct input_dev *dev = joydev->handle.dev; int i, val; for (i = 0; i < joydev->nabs; i++) { val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } } static int joydev_open_device(struct joydev *joydev) { int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) retval = -ENODEV; else if (!joydev->open++) { retval = input_open_device(&joydev->handle); if (retval) joydev->open--; else joydev_refresh_state(joydev); } mutex_unlock(&joydev->mutex); return retval; } static void joydev_close_device(struct joydev *joydev) { mutex_lock(&joydev->mutex); if (joydev->exist && !--joydev->open) input_close_device(&joydev->handle); mutex_unlock(&joydev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void joydev_hangup(struct joydev *joydev) { struct joydev_client *client; spin_lock(&joydev->client_lock); list_for_each_entry(client, &joydev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&joydev->client_lock); wake_up_interruptible(&joydev->wait); } static int joydev_release(struct inode *inode, struct file *file) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; joydev_detach_client(joydev, client); kfree(client); joydev_close_device(joydev); return 0; } static int joydev_open(struct inode *inode, struct file *file) { struct joydev *joydev = container_of(inode->i_cdev, struct joydev, cdev); struct joydev_client *client; int error; client = kzalloc(sizeof(struct joydev_client), GFP_KERNEL); if (!client) return -ENOMEM; spin_lock_init(&client->buffer_lock); client->joydev = joydev; joydev_attach_client(joydev, client); error = joydev_open_device(joydev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: joydev_detach_client(joydev, client); kfree(client); return error; } static int joydev_generate_startup_event(struct joydev_client *client, struct input_dev *input, struct js_event *event) { struct joydev *joydev = client->joydev; int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->startup < joydev->nabs + joydev->nkey; if (have_event) { event->time = jiffies_to_msecs(jiffies); if (client->startup < joydev->nkey) { event->type = JS_EVENT_BUTTON | JS_EVENT_INIT; event->number = client->startup; event->value = !!test_bit(joydev->keypam[event->number], input->key); } else { event->type = JS_EVENT_AXIS | JS_EVENT_INIT; event->number = client->startup - joydev->nkey; event->value = joydev->abs[event->number]; } client->startup++; } spin_unlock_irq(&client->buffer_lock); return have_event; } static int joydev_fetch_next_event(struct joydev_client *client, struct js_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= JOYDEV_BUFFER_SIZE - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } /* * Old joystick interface */ static ssize_t joydev_0x_read(struct joydev_client *client, struct input_dev *input, char __user *buf) { struct joydev *joydev = client->joydev; struct JS_DATA_TYPE data; int i; spin_lock_irq(&input->event_lock); /* * Get device state */ for (data.buttons = i = 0; i < 32 && i < joydev->nkey; i++) data.buttons |= test_bit(joydev->keypam[i], input->key) ? 
(1 << i) : 0; data.x = (joydev->abs[0] / 256 + 128) >> joydev->glue.JS_CORR.x; data.y = (joydev->abs[1] / 256 + 128) >> joydev->glue.JS_CORR.y; /* * Reset reader's event queue */ spin_lock(&client->buffer_lock); client->startup = 0; client->tail = client->head; spin_unlock(&client->buffer_lock); spin_unlock_irq(&input->event_lock); if (copy_to_user(buf, &data, sizeof(struct JS_DATA_TYPE))) return -EFAULT; return sizeof(struct JS_DATA_TYPE); } static inline int joydev_data_pending(struct joydev_client *client) { struct joydev *joydev = client->joydev; return client->startup < joydev->nabs + joydev->nkey || client->head != client->tail; } static ssize_t joydev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; struct input_dev *input = joydev->handle.dev; struct js_event event; int retval; if (!joydev->exist) return -ENODEV; if (count < sizeof(struct js_event)) return -EINVAL; if (count == sizeof(struct JS_DATA_TYPE)) return joydev_0x_read(client, input, buf); if (!joydev_data_pending(client) && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(joydev->wait, !joydev->exist || joydev_data_pending(client)); if (retval) return retval; if (!joydev->exist) return -ENODEV; while (retval + sizeof(struct js_event) <= count && joydev_generate_startup_event(client, input, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } while (retval + sizeof(struct js_event) <= count && joydev_fetch_next_event(client, &event)) { if (copy_to_user(buf + retval, &event, sizeof(struct js_event))) return -EFAULT; retval += sizeof(struct js_event); } return retval; } /* No kernel lock - fine */ static __poll_t joydev_poll(struct file *file, poll_table *wait) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; poll_wait(file, &joydev->wait, wait); return (joydev_data_pending(client) ? (EPOLLIN | EPOLLRDNORM) : 0) | (joydev->exist ? 0 : (EPOLLHUP | EPOLLERR)); } static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev, void __user *argp, size_t len) { __u8 *abspam; int i; int retval = 0; len = min(len, sizeof(joydev->abspam)); /* Validate the map. */ abspam = memdup_user(argp, len); if (IS_ERR(abspam)) return PTR_ERR(abspam); for (i = 0; i < len && i < joydev->nabs; i++) { if (abspam[i] > ABS_MAX) { retval = -EINVAL; goto out; } } memcpy(joydev->abspam, abspam, len); for (i = 0; i < joydev->nabs; i++) joydev->absmap[joydev->abspam[i]] = i; out: kfree(abspam); return retval; } static int joydev_handle_JSIOCSBTNMAP(struct joydev *joydev, void __user *argp, size_t len) { __u16 *keypam; int i; int retval = 0; if (len % sizeof(*keypam)) return -EINVAL; len = min(len, sizeof(joydev->keypam)); /* Validate the map. */ keypam = memdup_user(argp, len); if (IS_ERR(keypam)) return PTR_ERR(keypam); for (i = 0; i < (len / 2) && i < joydev->nkey; i++) { if (keypam[i] > KEY_MAX || keypam[i] < BTN_MISC) { retval = -EINVAL; goto out; } } memcpy(joydev->keypam, keypam, len); for (i = 0; i < joydev->nkey; i++) joydev->keymap[joydev->keypam[i] - BTN_MISC] = i; out: kfree(keypam); return retval; } static int joydev_ioctl_common(struct joydev *joydev, unsigned int cmd, void __user *argp) { struct input_dev *dev = joydev->handle.dev; size_t len; int i; const char *name; /* Process fixed-sized commands. 
*/ switch (cmd) { case JS_SET_CAL: return copy_from_user(&joydev->glue.JS_CORR, argp, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_GET_CAL: return copy_to_user(argp, &joydev->glue.JS_CORR, sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0; case JS_SET_TIMEOUT: return get_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JS_GET_TIMEOUT: return put_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp); case JSIOCGVERSION: return put_user(JS_VERSION, (__u32 __user *) argp); case JSIOCGAXES: return put_user(joydev->nabs, (__u8 __user *) argp); case JSIOCGBUTTONS: return put_user(joydev->nkey, (__u8 __user *) argp); case JSIOCSCORR: if (copy_from_user(joydev->corr, argp, sizeof(joydev->corr[0]) * joydev->nabs)) return -EFAULT; for (i = 0; i < joydev->nabs; i++) { int val = input_abs_get_val(dev, joydev->abspam[i]); joydev->abs[i] = joydev_correct(val, &joydev->corr[i]); } return 0; case JSIOCGCORR: return copy_to_user(argp, joydev->corr, sizeof(joydev->corr[0]) * joydev->nabs) ? -EFAULT : 0; } /* * Process variable-sized commands (the axis and button map commands * are considered variable-sized to decouple them from the values of * ABS_MAX and KEY_MAX). */ switch (cmd & ~IOCSIZE_MASK) { case (JSIOCSAXMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSAXMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGAXMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->abspam)); return copy_to_user(argp, joydev->abspam, len) ? -EFAULT : len; case (JSIOCSBTNMAP & ~IOCSIZE_MASK): return joydev_handle_JSIOCSBTNMAP(joydev, argp, _IOC_SIZE(cmd)); case (JSIOCGBTNMAP & ~IOCSIZE_MASK): len = min_t(size_t, _IOC_SIZE(cmd), sizeof(joydev->keypam)); return copy_to_user(argp, joydev->keypam, len) ? -EFAULT : len; case JSIOCGNAME(0): name = dev->name; if (!name) return 0; len = min_t(size_t, _IOC_SIZE(cmd), strlen(name) + 1); return copy_to_user(argp, name, len) ? -EFAULT : len; } return -EINVAL; } #ifdef CONFIG_COMPAT static long joydev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; s32 tmp32; struct JS_DATA_SAVE_TYPE_32 ds32; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(tmp32, (s32 __user *) arg); if (retval == 0) joydev->glue.JS_TIMELIMIT = tmp32; break; case JS_GET_TIMELIMIT: tmp32 = joydev->glue.JS_TIMELIMIT; retval = put_user(tmp32, (s32 __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&ds32, argp, sizeof(ds32)) ? -EFAULT : 0; if (retval == 0) { joydev->glue.JS_TIMEOUT = ds32.JS_TIMEOUT; joydev->glue.BUSY = ds32.BUSY; joydev->glue.JS_EXPIRETIME = ds32.JS_EXPIRETIME; joydev->glue.JS_TIMELIMIT = ds32.JS_TIMELIMIT; joydev->glue.JS_SAVE = ds32.JS_SAVE; joydev->glue.JS_CORR = ds32.JS_CORR; } break; case JS_GET_ALL: ds32.JS_TIMEOUT = joydev->glue.JS_TIMEOUT; ds32.BUSY = joydev->glue.BUSY; ds32.JS_EXPIRETIME = joydev->glue.JS_EXPIRETIME; ds32.JS_TIMELIMIT = joydev->glue.JS_TIMELIMIT; ds32.JS_SAVE = joydev->glue.JS_SAVE; ds32.JS_CORR = joydev->glue.JS_CORR; retval = copy_to_user(argp, &ds32, sizeof(ds32)) ? 
-EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } #endif /* CONFIG_COMPAT */ static long joydev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct joydev_client *client = file->private_data; struct joydev *joydev = client->joydev; void __user *argp = (void __user *)arg; int retval; retval = mutex_lock_interruptible(&joydev->mutex); if (retval) return retval; if (!joydev->exist) { retval = -ENODEV; goto out; } switch (cmd) { case JS_SET_TIMELIMIT: retval = get_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_GET_TIMELIMIT: retval = put_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg); break; case JS_SET_ALL: retval = copy_from_user(&joydev->glue, argp, sizeof(joydev->glue)) ? -EFAULT : 0; break; case JS_GET_ALL: retval = copy_to_user(argp, &joydev->glue, sizeof(joydev->glue)) ? -EFAULT : 0; break; default: retval = joydev_ioctl_common(joydev, cmd, argp); break; } out: mutex_unlock(&joydev->mutex); return retval; } static const struct file_operations joydev_fops = { .owner = THIS_MODULE, .read = joydev_read, .poll = joydev_poll, .open = joydev_open, .release = joydev_release, .unlocked_ioctl = joydev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = joydev_compat_ioctl, #endif .fasync = joydev_fasync, .llseek = no_llseek, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void joydev_mark_dead(struct joydev *joydev) { mutex_lock(&joydev->mutex); joydev->exist = false; mutex_unlock(&joydev->mutex); } static void joydev_cleanup(struct joydev *joydev) { struct input_handle *handle = &joydev->handle; joydev_mark_dead(joydev); joydev_hangup(joydev); /* joydev is marked dead so no one else accesses joydev->open */ if (joydev->open) input_close_device(handle); } /* * These codes are copied from hid-ids.h, unfortunately there is no common * usb_ids/bt_ids.h header. 
*/ #define USB_VENDOR_ID_SONY 0x054c #define USB_DEVICE_ID_SONY_PS3_CONTROLLER 0x0268 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER 0x05c4 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE 0x0ba0 #define USB_VENDOR_ID_THQ 0x20d6 #define USB_DEVICE_ID_THQ_PS3_UDRAW 0xcb17 #define USB_VENDOR_ID_NINTENDO 0x057e #define USB_DEVICE_ID_NINTENDO_JOYCONL 0x2006 #define USB_DEVICE_ID_NINTENDO_JOYCONR 0x2007 #define USB_DEVICE_ID_NINTENDO_PROCON 0x2009 #define USB_DEVICE_ID_NINTENDO_CHRGGRIP 0x200E #define ACCEL_DEV(vnd, prd) \ { \ .flags = INPUT_DEVICE_ID_MATCH_VENDOR | \ INPUT_DEVICE_ID_MATCH_PRODUCT | \ INPUT_DEVICE_ID_MATCH_PROPBIT, \ .vendor = (vnd), \ .product = (prd), \ .propbit = { BIT_MASK(INPUT_PROP_ACCELEROMETER) }, \ } static const struct input_device_id joydev_blacklist[] = { /* Avoid touchpads and touchscreens */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, }, /* Avoid tablets, digitisers and similar devices */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_DIGI)] = BIT_MASK(BTN_DIGI) }, }, /* Disable accelerometers on composite devices */ ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), ACCEL_DEV(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE), ACCEL_DEV(USB_VENDOR_ID_THQ, USB_DEVICE_ID_THQ_PS3_UDRAW), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_PROCON), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_CHRGGRIP), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONL), ACCEL_DEV(USB_VENDOR_ID_NINTENDO, USB_DEVICE_ID_NINTENDO_JOYCONR), { /* sentinel */ } }; static bool joydev_dev_is_blacklisted(struct input_dev *dev) { const struct input_device_id *id; for (id = joydev_blacklist; id->flags; id++) { if (input_match_device_id(dev, id)) { dev_dbg(&dev->dev, "joydev: blacklisting '%s'\n", dev->name); return true; } } return false; } static bool joydev_dev_is_absolute_mouse(struct input_dev *dev) { DECLARE_BITMAP(jd_scratch, KEY_CNT); bool ev_match = false; BUILD_BUG_ON(ABS_CNT > KEY_CNT || EV_CNT > KEY_CNT); /* * Virtualization (VMware, etc) and remote management (HP * ILO2) solutions use absolute coordinates for their virtual * pointing devices so that there is one-to-one relationship * between pointer position on the host screen and virtual * guest screen, and so their mice use ABS_X, ABS_Y and 3 * primary button events. This clashes with what joydev * considers to be joysticks (a device with at minimum ABS_X * axis). * * Here we are trying to separate absolute mice from * joysticks. A device is, for joystick detection purposes, * considered to be an absolute mouse if the following is * true: * * 1) Event types are exactly * EV_ABS, EV_KEY and EV_SYN * or * EV_ABS, EV_KEY, EV_SYN and EV_MSC * or * EV_ABS, EV_KEY, EV_SYN, EV_MSC and EV_REL. * 2) Absolute events are exactly ABS_X and ABS_Y. * 3) Keys are exactly BTN_LEFT, BTN_RIGHT and BTN_MIDDLE. * 4) Device is not on "Amiga" bus. 
*/ bitmap_zero(jd_scratch, EV_CNT); /* VMware VMMouse, HP ILO2 */ __set_bit(EV_ABS, jd_scratch); __set_bit(EV_KEY, jd_scratch); __set_bit(EV_SYN, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* HP ILO2, AMI BMC firmware */ __set_bit(EV_MSC, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; /* VMware Virtual USB Mouse, QEMU USB Tablet, ATEN BMC firmware */ __set_bit(EV_REL, jd_scratch); if (bitmap_equal(jd_scratch, dev->evbit, EV_CNT)) ev_match = true; if (!ev_match) return false; bitmap_zero(jd_scratch, ABS_CNT); __set_bit(ABS_X, jd_scratch); __set_bit(ABS_Y, jd_scratch); if (!bitmap_equal(dev->absbit, jd_scratch, ABS_CNT)) return false; bitmap_zero(jd_scratch, KEY_CNT); __set_bit(BTN_LEFT, jd_scratch); __set_bit(BTN_RIGHT, jd_scratch); __set_bit(BTN_MIDDLE, jd_scratch); if (!bitmap_equal(dev->keybit, jd_scratch, KEY_CNT)) return false; /* * Amiga joystick (amijoy) historically uses left/middle/right * button events. */ if (dev->id.bustype == BUS_AMIGA) return false; return true; } static bool joydev_match(struct input_handler *handler, struct input_dev *dev) { /* Disable blacklisted devices */ if (joydev_dev_is_blacklisted(dev)) return false; /* Avoid absolute mice */ if (joydev_dev_is_absolute_mouse(dev)) return false; return true; } static int joydev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct joydev *joydev; int i, j, t, minor, dev_no; int error; minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } joydev = kzalloc(sizeof(struct joydev), GFP_KERNEL); if (!joydev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&joydev->client_list); spin_lock_init(&joydev->client_lock); mutex_init(&joydev->mutex); init_waitqueue_head(&joydev->wait); joydev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < JOYDEV_MINOR_BASE + JOYDEV_MINORS) dev_no -= JOYDEV_MINOR_BASE; dev_set_name(&joydev->dev, "js%d", dev_no); joydev->handle.dev = input_get_device(dev); joydev->handle.name = dev_name(&joydev->dev); joydev->handle.handler = handler; joydev->handle.private = joydev; for_each_set_bit(i, dev->absbit, ABS_CNT) { joydev->absmap[i] = joydev->nabs; joydev->abspam[joydev->nabs] = i; joydev->nabs++; } for (i = BTN_JOYSTICK - BTN_MISC; i < KEY_MAX - BTN_MISC + 1; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i = 0; i < joydev->nabs; i++) { j = joydev->abspam[i]; if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) { joydev->corr[i].type = JS_CORR_NONE; continue; } joydev->corr[i].type = JS_CORR_BROKEN; joydev->corr[i].prec = input_abs_get_fuzz(dev, j); t = (input_abs_get_max(dev, j) + input_abs_get_min(dev, j)) / 2; joydev->corr[i].coef[0] = t - input_abs_get_flat(dev, j); joydev->corr[i].coef[1] = t + input_abs_get_flat(dev, j); t = (input_abs_get_max(dev, j) - input_abs_get_min(dev, j)) / 2 - 2 * input_abs_get_flat(dev, j); if (t) { joydev->corr[i].coef[2] = (1 << 29) / t; joydev->corr[i].coef[3] = (1 << 29) / t; } } joydev->dev.devt = MKDEV(INPUT_MAJOR, minor); joydev->dev.class = &input_class; joydev->dev.parent = 
&dev->dev; joydev->dev.release = joydev_free; device_initialize(&joydev->dev); error = input_register_handle(&joydev->handle); if (error) goto err_free_joydev; cdev_init(&joydev->cdev, &joydev_fops); error = cdev_device_add(&joydev->cdev, &joydev->dev); if (error) goto err_cleanup_joydev; return 0; err_cleanup_joydev: joydev_cleanup(joydev); input_unregister_handle(&joydev->handle); err_free_joydev: put_device(&joydev->dev); err_free_minor: input_free_minor(minor); return error; } static void joydev_disconnect(struct input_handle *handle) { struct joydev *joydev = handle->private; cdev_device_del(&joydev->cdev, &joydev->dev); joydev_cleanup(joydev); input_free_minor(MINOR(joydev->dev.devt)); input_unregister_handle(handle); put_device(&joydev->dev); } static const struct input_device_id joydev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_X) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_Z) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_WHEEL) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_ABS) }, .absbit = { BIT_MASK(ABS_THROTTLE) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = {[BIT_WORD(BTN_JOYSTICK)] = BIT_MASK(BTN_JOYSTICK) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_GAMEPAD)] = BIT_MASK(BTN_GAMEPAD) }, }, { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, .evbit = { BIT_MASK(EV_KEY) }, .keybit = { [BIT_WORD(BTN_TRIGGER_HAPPY)] = BIT_MASK(BTN_TRIGGER_HAPPY) }, }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, joydev_ids); static struct input_handler joydev_handler = { .event = joydev_event, .match = joydev_match, .connect = joydev_connect, .disconnect = joydev_disconnect, .legacy_minors = true, .minor = JOYDEV_MINOR_BASE, .name = "joydev", .id_table = joydev_ids, }; static int __init joydev_init(void) { return input_register_handler(&joydev_handler); } static void __exit joydev_exit(void) { input_unregister_handler(&joydev_handler); } module_init(joydev_init); module_exit(joydev_exit);
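/*
 * Editor's illustration (not part of joydev.c above): a minimal userspace
 * consumer of the legacy joystick interface implemented by this handler.
 * The device node name (/dev/input/js0) is an assumption; the ioctls and
 * struct js_event come from <linux/joystick.h> and map onto the
 * joydev_ioctl_common() and joydev_read() paths above. On open, the driver
 * first synthesizes JS_EVENT_INIT events describing the current state.
 */
#include <fcntl.h>
#include <linux/joystick.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct js_event e;
	char name[64] = "unknown";
	__u8 axes = 0, buttons = 0;
	int fd;

	fd = open("/dev/input/js0", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/input/js0");
		return 1;
	}

	ioctl(fd, JSIOCGAXES, &axes);		/* handled by JSIOCGAXES above */
	ioctl(fd, JSIOCGBUTTONS, &buttons);	/* handled by JSIOCGBUTTONS above */
	ioctl(fd, JSIOCGNAME(sizeof(name)), name);

	printf("%s: %u axes, %u buttons\n", name, axes, buttons);

	/* Each read() returns whole struct js_event records (see joydev_read()). */
	while (read(fd, &e, sizeof(e)) == sizeof(e)) {
		const char *kind =
			(e.type & JS_EVENT_BUTTON) ? "button" : "axis";

		printf("%s%s %u = %d\n",
		       (e.type & JS_EVENT_INIT) ? "init " : "", kind,
		       e.number, e.value);
	}

	close(fd);
	return 0;
}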
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
 * Copyright (c) 2012 Intel Corporation
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/string.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_tables.h>
#include <net/ip.h>

struct nft_nat {
	u8			sreg_addr_min;
	u8			sreg_addr_max;
	u8			sreg_proto_min;
	u8			sreg_proto_max;
	enum nf_nat_manip_type	type:8;
	u8			family;
	u16			flags;
};

static void nft_nat_setup_addr(struct nf_nat_range2 *range,
			       const struct nft_regs *regs,
			       const struct nft_nat *priv)
{
	switch (priv->family) {
	case AF_INET:
		range->min_addr.ip = (__force __be32)
				regs->data[priv->sreg_addr_min];
		range->max_addr.ip = (__force __be32)
				regs->data[priv->sreg_addr_max];
		break;
	case AF_INET6:
		memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min],
		       sizeof(range->min_addr.ip6));
		memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max],
		       sizeof(range->max_addr.ip6));
		break;
	}
}

static void nft_nat_setup_proto(struct nf_nat_range2 *range,
				const struct nft_regs *regs,
				const struct nft_nat *priv)
{
	range->min_proto.all = (__force __be16)
		nft_reg_load16(&regs->data[priv->sreg_proto_min]);
	range->max_proto.all = (__force __be16)
		nft_reg_load16(&regs->data[priv->sreg_proto_max]);
}

static void nft_nat_setup_netmap(struct nf_nat_range2 *range,
				 const struct nft_pktinfo *pkt,
				 const struct nft_nat *priv)
{
	struct sk_buff *skb = pkt->skb;
	union nf_inet_addr new_addr;
	__be32 netmask;
	int i, len = 0;

	switch (priv->type) {
	case NFT_NAT_SNAT:
		if (nft_pf(pkt) == NFPROTO_IPV4) {
			new_addr.ip = ip_hdr(skb)->saddr;
			len = sizeof(struct in_addr);
		} else {
new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags |= 
NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return ret; } 
static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
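The NETMAP handling in nft_nat_setup_netmap() above keeps the host bits of the original address and substitutes the network bits of the configured range: the per-word mask is built from the bits on which the range's minimum and maximum addresses agree. A minimal standalone sketch of that arithmetic, using made-up IPv4 addresses (only the bit manipulation mirrors the kernel code):

/*
 * Standalone userspace sketch (not kernel code): demonstrates the per-word
 * masking used by nft_nat_setup_netmap(). Addresses are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* Original packet source address (hypothetical). */
	uint32_t orig = ntohl(inet_addr("10.1.2.3"));
	/* Configured NETMAP range 192.168.0.0 - 192.168.255.255 (hypothetical). */
	uint32_t range_min = ntohl(inet_addr("192.168.0.0"));
	uint32_t range_max = ntohl(inet_addr("192.168.255.255"));

	/* Bits where min and max agree form the network part of the range. */
	uint32_t netmask = ~(range_min ^ range_max);

	/* Keep the host bits of the original, take network bits from the range. */
	uint32_t mapped = (orig & ~netmask) | (range_min & netmask);

	struct in_addr out = { .s_addr = htonl(mapped) };
	printf("mapped address: %s\n", inet_ntoa(out));
	return 0;
}

With these values the 10.1.2.3 source maps to 192.168.2.3: the low 16 host bits survive while the /16 network prefix is replaced, which is exactly what the kernel loop does one 32-bit word at a time for IPv4 or IPv6.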
847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct 
*work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. 
* * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. 
* * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; 
} static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. 
*/ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. 
Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. 
*/ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 
* And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. 
*/ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init);
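The map_write() parser above consumes a one-shot write of "first lower_first count" lines to /proc/<pid>/uid_map or gid_map, and proc_setgroups_write() must see "deny" before an unprivileged gid_map write is accepted. A hedged userspace sketch of driving that interface for a freshly unshared namespace (the identity-style single-id mapping is illustrative):

/*
 * Userspace sketch (assumed helper, not part of the kernel sources above):
 * map the caller's uid/gid to id 0 inside a new user namespace.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ssize_t ret = write(fd, buf, strlen(buf));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	uid_t uid = getuid();
	gid_t gid = getgid();
	char map[64];

	if (unshare(CLONE_NEWUSER))	/* create the new user namespace */
		return 1;

	/* One line per extent: "id-inside-ns id-in-parent-ns count". */
	snprintf(map, sizeof(map), "0 %u 1\n", uid);
	if (write_file("/proc/self/uid_map", map))
		return 1;

	/* Unprivileged writers must disable setgroups before writing gid_map. */
	if (write_file("/proc/self/setgroups", "deny"))
		return 1;

	snprintf(map, sizeof(map), "0 %u 1\n", gid);
	if (write_file("/proc/self/gid_map", map))
		return 1;

	printf("euid inside namespace: %u\n", geteuid());	/* 0 on success */
	return 0;
}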
6 6 6 6 5 5 6 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 /* * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * Copyright (c) 2008 Red Hat Inc. * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_gem.h> #include <drm/drm_mode.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: overview * * The KMS API doesn't standardize backing storage object creation and leaves it * to driver-specific ioctls. Furthermore actually creating a buffer object even * for GEM-based drivers is done through a driver-specific ioctl - GEM only has * a common userspace interface for sharing and destroying objects. While not an * issue for full-fledged graphics stacks that include device-specific userspace * components (in libdrm for instance), this limit makes DRM-based early boot * graphics unnecessarily complex. * * Dumb objects partly alleviate the problem by providing a standard API to * create dumb buffers suitable for scanout, which can then be used to create * KMS frame buffers. * * To support dumb objects drivers must implement the &drm_driver.dumb_create * and &drm_driver.dumb_map_offset operations (the latter defaults to * drm_gem_dumb_map_offset() if not set). Drivers that don't use GEM handles * additionally need to implement the &drm_driver.dumb_destroy operation. See * the callbacks for further details. * * Note that dumb objects may not be used for gpu acceleration, as has been * attempted on some ARM embedded platforms. Such drivers really must have * a hardware-specific ioctl to allocate suitable buffer objects. 
*/ int drm_mode_create_dumb(struct drm_device *dev, struct drm_mode_create_dumb *args, struct drm_file *file_priv) { u32 cpp, stride, size; if (!dev->driver->dumb_create) return -ENOSYS; if (!args->width || !args->height || !args->bpp) return -EINVAL; /* overflow checks for 32bit size calculations */ if (args->bpp > U32_MAX - 8) return -EINVAL; cpp = DIV_ROUND_UP(args->bpp, 8); if (cpp > U32_MAX / args->width) return -EINVAL; stride = cpp * args->width; if (args->height > U32_MAX / stride) return -EINVAL; /* test for wrap-around */ size = args->height * stride; if (PAGE_ALIGN(size) == 0) return -EINVAL; /* * handle, pitch and size are output parameters. Zero them out to * prevent drivers from accidentally using uninitialized data. Since * not all existing userspace is clearing these fields properly we * cannot reject IOCTL with garbage in them. */ args->handle = 0; args->pitch = 0; args->size = 0; return dev->driver->dumb_create(file_priv, dev, args); } int drm_mode_create_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return drm_mode_create_dumb(dev, data, file_priv); } /** * drm_mode_mmap_dumb_ioctl - create an mmap offset for a dumb backing storage buffer * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Allocate an offset in the drm device node's address space to be able to * memory map a dumb buffer. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_mmap_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_map_dumb *args = data; if (!dev->driver->dumb_create) return -ENOSYS; if (dev->driver->dumb_map_offset) return dev->driver->dumb_map_offset(file_priv, dev, args->handle, &args->offset); else return drm_gem_dumb_map_offset(file_priv, dev, args->handle, &args->offset); } int drm_mode_destroy_dumb(struct drm_device *dev, u32 handle, struct drm_file *file_priv) { if (!dev->driver->dumb_create) return -ENOSYS; return drm_gem_handle_delete(file_priv, handle); } int drm_mode_destroy_dumb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_destroy_dumb *args = data; return drm_mode_destroy_dumb(dev, args->handle, file_priv); }
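drm_mode_create_dumb() only validates the geometry and delegates the allocation to the driver's dumb_create hook, while the map_dumb ioctl hands back a fake mmap offset on the DRM device node. A hedged userspace sketch of the full round trip; the device path, buffer geometry, and header locations are illustrative and error handling is truncated:

/*
 * Userspace sketch of the dumb-buffer ioctls handled above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <drm/drm.h>
#include <drm/drm_mode.h>

int main(void)
{
	int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return 1;

	/* DRM_IOCTL_MODE_CREATE_DUMB fills in handle, pitch and size. */
	struct drm_mode_create_dumb create = {
		.width = 640, .height = 480, .bpp = 32,
	};
	if (ioctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create))
		return 1;

	/* Ask for a fake mmap offset for the buffer object. */
	struct drm_mode_map_dumb map = { .handle = create.handle };
	if (ioctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map))
		return 1;

	uint32_t *pixels = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
				MAP_SHARED, fd, map.offset);
	if (pixels != MAP_FAILED) {
		memset(pixels, 0xff, create.size);	/* fill with white */
		munmap(pixels, create.size);
	}

	struct drm_mode_destroy_dumb destroy = { .handle = create.handle };
	ioctl(fd, DRM_IOCTL_MODE_DESTROY_DUMB, &destroy);
	close(fd);
	return 0;
}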
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_LABELS_H
#define _NF_CONNTRACK_LABELS_H

#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <uapi/linux/netfilter/xt_connlabel.h>

#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)

struct nf_conn_labels {
	unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
};

/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols
 * exported by nf_conntrack module.
 */
static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
	struct nf_ct_ext *ext = ct->ext;

	if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
		return NULL;

	return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS];
#else
	return NULL;
#endif
}

static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
	struct net *net = nf_ct_net(ct);

	if (atomic_read(&net->ct.labels_used) == 0)
		return NULL;

	return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC);
#else
	return NULL;
#endif
}

int nf_connlabels_replace(struct nf_conn *ct,
			  const u32 *data, const u32 *mask, unsigned int words);

#ifdef CONFIG_NF_CONNTRACK_LABELS
int nf_connlabels_get(struct net *net, unsigned int bit);
void nf_connlabels_put(struct net *net);
#else
static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; }
static inline void nf_connlabels_put(struct net *net) {}
#endif

#endif /* _NF_CONNTRACK_LABELS_H */
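A hedged kernel-side sketch of how a hypothetical caller might use the helpers declared above: reserve a label bit, make sure the extension is present on a (still unconfirmed) connection, and flip just that bit. The function name and the chosen bit are illustrative; real users typically hold the nf_connlabels_get() reference for their whole lifetime rather than dropping it immediately.

/* Hypothetical module code, not part of the header above. */
static int example_mark_connection(struct net *net, struct nf_conn *ct)
{
	const unsigned int bit = 4;	/* illustrative label bit */
	u32 bits[NF_CT_LABELS_MAX_SIZE / sizeof(u32)] = {};
	u32 mask[NF_CT_LABELS_MAX_SIZE / sizeof(u32)] = {};
	int err;

	/* Tell conntrack the highest label bit we intend to use. */
	err = nf_connlabels_get(net, bit);
	if (err)
		return err;

	/* Make sure the label extension exists on this connection. */
	if (!nf_ct_labels_ext_add(ct)) {
		nf_connlabels_put(net);
		return -ENOSPC;
	}

	bits[bit / 32] = 1U << (bit % 32);
	mask[bit / 32] = 1U << (bit % 32);

	/* Replace only the masked bit; other label bits stay untouched. */
	err = nf_connlabels_replace(ct, bits, mask, ARRAY_SIZE(bits));

	/* Compressed for illustration; normally put at module exit. */
	nf_connlabels_put(net);
	return err;
}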
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MBCACHE_H
#define _LINUX_MBCACHE_H

#include <linux/hash.h>
#include <linux/list_bl.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/fs.h>

struct mb_cache;

/* Cache entry flags */
enum {
	MBE_REFERENCED_B = 0,
	MBE_REUSABLE_B
};

struct mb_cache_entry {
	/* List of entries in cache - protected by cache->c_list_lock */
	struct list_head	e_list;
	/*
	 * Hash table list - protected by hash chain bitlock. The entry is
	 * guaranteed to be hashed while e_refcnt > 0.
	 */
	struct hlist_bl_node	e_hash_list;
	/*
	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
	 * While refcount > 0, the entry is guaranteed to stay in the hash and
	 * e.g. mb_cache_entry_try_delete() will fail.
	 */
	atomic_t		e_refcnt;
	/* Key in hash - stable during lifetime of the entry */
	u32			e_key;
	unsigned long		e_flags;
	/* User provided value - stable during lifetime of the entry */
	u64			e_value;
};

struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache);

int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
			  u64 value, bool reusable);
void __mb_cache_entry_free(struct mb_cache *cache,
			   struct mb_cache_entry *entry);
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);

static inline void mb_cache_entry_put(struct mb_cache *cache,
				      struct mb_cache_entry *entry)
{
	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);

	if (cnt > 0) {
		if (cnt <= 2)
			wake_up_var(&entry->e_refcnt);
		return;
	}
	__mb_cache_entry_free(cache, entry);
}

struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
						    u32 key, u64 value);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
					  u64 value);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
						 u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
						struct mb_cache_entry *entry);
void mb_cache_entry_touch(struct mb_cache *cache,
			  struct mb_cache_entry *entry);

#endif	/* _LINUX_MBCACHE_H */
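A hedged sketch of typical mb_cache usage by a hypothetical filesystem caller: entries are published under a content hash with the block number as the value, and lookups walk every entry sharing that hash. Names and values are illustrative; only the declarations above are relied on.

/* Hypothetical caller, not part of the header above. */
static int example_mbcache_usage(void)
{
	struct mb_cache *cache;
	struct mb_cache_entry *entry;
	u32 hash = 0xabcd1234;		/* illustrative content hash */
	u64 block = 42;			/* illustrative block number */
	int err;

	cache = mb_cache_create(10);	/* 2^10 hash buckets */
	if (!cache)
		return -ENOMEM;

	/* Publish the block under its hash; "reusable" means it may be shared. */
	err = mb_cache_entry_create(cache, GFP_KERNEL, hash, block, true);
	if (err && err != -EBUSY)
		goto out;

	/* Later: walk all candidate blocks with the same hash. */
	entry = mb_cache_entry_find_first(cache, hash);
	while (entry) {
		if (entry->e_value == block)
			mb_cache_entry_touch(cache, entry);	/* mark as used */
		/* find_next drops the reference on the entry passed in. */
		entry = mb_cache_entry_find_next(cache, entry);
	}
	err = 0;
out:
	mb_cache_destroy(cache);
	return err;
}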
2 1 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. 
*/ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } exit_put: put_callchain_entry(rctx); return entry; } /* * Used for sysctl_perf_event_max_stack and * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; }
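/*
 * Illustrative sketch (not kernel code): the per-CPU callchain buffer is
 * sized as sizeof(struct perf_callchain_entry) plus one __u64 slot per
 * possible stack entry and context marker, times PERF_NR_CONTEXTS
 * recursion levels. The concrete numbers below assume the usual defaults
 * (max_stack = 127, max_contexts_per_stack = 8, 4 recursion contexts,
 * an 8-byte entry header) purely to make the arithmetic visible.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	size_t hdr = sizeof(uint64_t);	/* struct perf_callchain_entry { __u64 nr; __u64 ip[]; } */
	size_t max_stack = 127;		/* assumed sysctl_perf_event_max_stack default */
	size_t max_ctx_per_stack = 8;	/* assumed sysctl_perf_event_max_contexts_per_stack default */
	size_t nr_contexts = 4;		/* assumed PERF_NR_CONTEXTS */

	size_t entry_sz = hdr + sizeof(uint64_t) * (max_stack + max_ctx_per_stack);
	size_t percpu_sz = entry_sz * nr_contexts;

	printf("one entry: %zu bytes, per-CPU buffer: %zu bytes\n",
	       entry_sz, percpu_sz);
	return 0;
}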
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
* * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. 
* * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* * Clean up the remaining folios. The sizes in ->ra * may be used to size the next readahead, so make sure * they accurately reflect what happened. 
*/ while ((folio = readahead_folio(rac)) != NULL) { unsigned long nr = folio_nr_pages(folio); folio_get(folio); rac->ra->size -= nr; if (rac->ra->async_size >= nr) { rac->ra->async_size -= nr; filemap_remove_folio(folio); } folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long i; /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. */ read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. 
*/ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ index = readahead_index(ractl); max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; ractl->_index = index; do_page_cache_ra(ractl, this_chunk, 0); index += this_chunk; nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. * * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. 
* It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ /* * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); return index - 1 - head; } /* * page cache context based readahead */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; size = count_history_pages(mapping, index, max); /* * not enough history pages: * it could be a random read */ if (size <= req_size) return 0; /* * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ if (size >= index) size *= 2; ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; return 1; } static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = filemap_alloc_folio(gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; pgoff_t index = readahead_index(ractl); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); if (!mapping_large_folio_support(mapping) || ra->size < 4) goto fallback; limit = min(limit, index + ra->size - 1); if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } if (index > limit) { ra->size += index - limit - 1; ra->async_size += index - limit - 1; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left 
some gaps. Let the regular readahead code take care of this * situation. */ if (!err) return; fallback: do_page_cache_ra(ractl, ra->size, ra->async_size); } /* * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, struct folio *folio, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; pgoff_t index = readahead_index(ractl); pgoff_t expected, prev_index; unsigned int order = folio ? folio_order(folio) : 0; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); /* * start of file */ if (!index) goto initial_readahead; /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ if (folio) { pgoff_t start; rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } /* * oversize read */ if (req_size > max_pages) goto initial_readahead; /* * sequential cache miss * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ if (try_context_readahead(ractl->mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ do_page_cache_ra(ractl, req_size, 0); return; initial_readahead: ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: /* * Will this read hit the readahead marker made by itself? * If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; ra->size += add_pages; } else { ra->size = max_pages; ra->async_size = max_pages >> 1; } } ractl->_index = ra->start; page_cache_ra_order(ractl, ra, order); } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. 
The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ractl->ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } ondemand_readahead(ractl, NULL, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { /* no readahead */ if (!ractl->ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; folio_clear_readahead(folio); if (blk_cgroup_congested()) return; ondemand_readahead(ractl, folio, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct fd f; ret = -EBADF; f = fdget(fd); if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || (!S_ISREG(file_inode(f.file)->i_mode) && !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); out: fdput(f); return ret; } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. 
*/ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); new_index = new_start / PAGE_SIZE; /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; if (ra) { ra->size++; ra->async_size++; } } } EXPORT_SYMBOL(readahead_expand);
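/*
 * Illustrative sketch (not kernel code): get_init_ra_size() and
 * get_next_ra_size() above can be exercised in userspace to see how the
 * window ramps up for a sequential reader. With a 2-page initial request
 * and the common 32-page (128 KiB) maximum, the window goes
 * 4 -> 8 -> 16 -> 32 and then stays pinned at the maximum. The helpers
 * below are userspace reimplementations for demonstration only.
 */
#include <stdio.h>

/* round up to the next power of two (small helper for the demo) */
static unsigned long roundup_pow2(unsigned long v)
{
	unsigned long r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow2(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;
	return newsize;
}

static unsigned long get_next_ra_size(unsigned long cur, unsigned long max)
{
	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}

int main(void)
{
	unsigned long max = 32;				/* 128 KiB of 4 KiB pages */
	unsigned long size = get_init_ra_size(2, max);	/* first miss, 2-page read */
	int i;

	printf("initial window: %lu pages\n", size);
	for (i = 0; i < 4; i++) {
		size = get_next_ra_size(size, max);
		printf("after async hit %d: %lu pages\n", i + 1, size);
	}
	return 0;
}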
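/*
 * Illustrative sketch (not kernel code): from userspace, the readahead()
 * system call handled by ksys_readahead() above behaves much like
 * posix_fadvise(POSIX_FADV_WILLNEED), since the syscall routes through
 * vfs_fadvise(). A small demo asking the kernel to pre-populate the first
 * megabyte of a file; the path is just an example.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int err;
	int fd = open("/etc/services", O_RDONLY);	/* any readable regular file */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Hint via the dedicated syscall... */
	if (readahead(fd, 0, 1024 * 1024) < 0)
		perror("readahead");

	/* ...or equivalently via fadvise, as the syscall is implemented. */
	err = posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_WILLNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: error %d\n", err);

	close(fd);
	return 0;
}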
40 37 24 24 24 52 53 53 53 9 36 46 1 3 8 12 23 23 7 12 23 6 26 2 1 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XFRM_HASH_H #define _XFRM_HASH_H #include <linux/xfrm.h> #include <linux/socket.h> #include <linux/jhash.h> static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { return ntohl(addr->a4); } static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) { return jhash2((__force u32 *)addr->a6, 4, 0); } static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; return ntohl((__force __be32)sum); } static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr) { return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr); } static inline u32 __bits2mask32(__u8 bits) { u32 mask32 = 0xffffffff; if (bits == 0) mask32 = 0; else if (bits < 32) mask32 <<= (32 - bits); return mask32; } static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), ntohl(saddr->a4) & __bits2mask32(sbits), 0); } static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, __u8 prefixlen) { unsigned int pdw; unsigned int pbi; u32 initval = 0; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); initval = (__force u32)(addr->a6[pdw] & mask); } return jhash2((__force u32 *)addr->a6, pdw, initval); } static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, __u8 dbits, __u8 sbits) { return __xfrm6_pref_hash(daddr, dbits) ^ __xfrm6_pref_hash(saddr, sbits); } static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, unsigned int hmask) { unsigned int h = family ^ reqid; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask) { unsigned int h = family; switch (family) { case AF_INET: h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); break; case AF_INET6: h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); break; } return (h ^ (h >> 16)) & hmask; } static inline unsigned int __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, unsigned int hmask) { unsigned int h = (__force u32)spi ^ proto; switch (family) { case AF_INET: h ^= __xfrm4_addr_hash(daddr); 
break; case AF_INET6: h ^= __xfrm6_addr_hash(daddr); break; } return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __xfrm_seq_hash(u32 seq, unsigned int hmask) { unsigned int h = seq; return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; unsigned int h = 0; switch (family) { case AF_INET: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits) return hmask + 1; h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, unsigned int hmask, u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); return h & hmask; } struct hlist_head *xfrm_hash_alloc(unsigned int sz); void xfrm_hash_free(struct hlist_head *n, unsigned int sz); #endif /* _XFRM_HASH_H */
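/*
 * Illustrative sketch (not kernel code): __bits2mask32() above builds a
 * host-order netmask from a prefix length, and the hash helpers fold the
 * mixed value into a bucket with (h ^ (h >> 16)) & hmask. A userspace
 * model of both steps; the address, the xor constant standing in for the
 * jhash mixing, and the table size are made up for the demo.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t bits2mask32(uint8_t bits)
{
	uint32_t mask32 = 0xffffffff;

	if (bits == 0)
		mask32 = 0;
	else if (bits < 32)
		mask32 <<= (32 - bits);
	return mask32;
}

int main(void)
{
	uint32_t daddr = 0xc0a80142;	/* 192.168.1.66 in host order */
	uint32_t hmask = 255;		/* 256-bucket table */
	uint32_t masked, h;

	masked = daddr & bits2mask32(24);	/* keep the /24 prefix: 192.168.1.0 */
	h = masked ^ 0x12345678;		/* stand-in for the jhash mixing step */
	printf("masked=%08x bucket=%u\n", masked, (h ^ (h >> 16)) & hmask);
	return 0;
}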
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
#include "waitid.h"
#include "futex.h"
#include "cancel.h"

struct io_cancel {
	struct file			*file;
	u64				addr;
	u32				flags;
	s32				fd;
	u8				opcode;
};

#define CANCEL_FLAGS	(IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
			 IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
			 IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)

/*
 * Returns true if the request matches the criteria outlined by 'cd'.
*/ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; if (req->ctx != cd->ctx) return false; if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) match_user_data = true; if (cd->flags & IORING_ASYNC_CANCEL_ANY) goto check_seq; if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } if (cd->flags & IORING_ASYNC_CANCEL_OP) { if (req->opcode != cd->opcode) return false; } if (match_user_data && req->cqe.user_data != cd->data) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: if (io_cancel_match_sequence(req, cd->seq)) return false; } return true; } static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; return io_cancel_req_match(req, cd); } static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { enum io_wq_cancel cancel_ret; int ret = 0; bool all; if (!tctx || !tctx->io_wq) return -ENOENT; all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; break; case IO_WQ_CANCEL_RUNNING: ret = -EALREADY; break; case IO_WQ_CANCEL_NOTFOUND: ret = -ENOENT; break; } return ret; } int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { struct io_ring_ctx *ctx = cd->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. */ if (!ret) return 0; ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_waitid_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_futex_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); return ret; } int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); cancel->flags = READ_ONCE(sqe->cancel_flags); if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; if (cancel->flags & IORING_ASYNC_CANCEL_FD) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->opcode = READ_ONCE(sqe->len); } return 0; } static int __io_async_cancel(struct io_cancel_data *cd, struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; do { ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) return ret; nr++; } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); return all ? 
nr : ret; } int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); struct io_cancel_data cd = { .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE || cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); } else { req->file = io_file_get_normal(req, cancel->fd); } if (!req->file) { ret = -EBADF; goto done; } cd.file = req->file; } ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { spin_lock_init(&table->hbs[i].lock); INIT_HLIST_HEAD(&table->hbs[i].list); } } static int __io_sync_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int fd) { struct io_ring_ctx *ctx = cd->ctx; /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } return __io_async_cancel(cd, tctx, 0); } int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_cancel_data cd = { .ctx = ctx, .seq = atomic_inc_return(&ctx->cancel_seq), }; ktime_t timeout = KTIME_MAX; struct io_uring_sync_cancel_reg sc; struct file *file = NULL; DEFINE_WAIT(wait); int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad); i++) if (sc.pad[i]) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) if (sc.pad2[i]) return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { file = fget(sc.fd); if (!file) return -EBADF; cd.file = file; } ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); /* found something, done! */ if (ret != -EALREADY) goto out; if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { struct timespec64 ts = { .tv_sec = sc.timeout.tv_sec, .tv_nsec = sc.timeout.tv_nsec }; timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } /* * Keep looking until we get -ENOENT. we'll get woken everytime * every time a request completes and will retry the cancelation. */ do { cd.seq = atomic_inc_return(&ctx->cancel_seq); prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); mutex_unlock(&ctx->uring_lock); if (ret != -EALREADY) break; ret = io_run_task_work_sig(ctx); if (ret < 0) break; ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); if (!ret) { ret = -ETIME; break; } mutex_lock(&ctx->uring_lock); } while (1); finish_wait(&ctx->cq_wait, &wait); mutex_lock(&ctx->uring_lock); if (ret == -ENOENT || ret > 0) ret = 0; out: if (file) fput(file); return ret; }
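/*
 * Illustrative sketch (not kernel code): io_async_cancel_prep() above
 * reads the cancel target from sqe->addr, the match flags from
 * sqe->cancel_flags, the fd from sqe->fd and the opcode filter from
 * sqe->len. A userspace helper that fills those fields for an
 * IORING_OP_ASYNC_CANCEL request; it assumes a recent uapi header that
 * defines IORING_ASYNC_CANCEL_ALL, and leaves ring setup and submission
 * (e.g. via liburing) out to keep the sketch self-contained.
 */
#include <string.h>
#include <stdio.h>
#include <linux/io_uring.h>

static void prep_async_cancel(struct io_uring_sqe *sqe, __u64 target_user_data,
			      __u32 cancel_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->addr = target_user_data;		/* matched against req->cqe.user_data */
	sqe->cancel_flags = cancel_flags;	/* e.g. IORING_ASYNC_CANCEL_ALL */
	sqe->fd = -1;				/* only used with IORING_ASYNC_CANCEL_FD */
}

int main(void)
{
	struct io_uring_sqe sqe;

	prep_async_cancel(&sqe, 0xdeadbeef, IORING_ASYNC_CANCEL_ALL);
	printf("opcode=%u addr=%llx flags=%x\n",
	       (unsigned int)sqe.opcode, (unsigned long long)sqe.addr,
	       sqe.cancel_flags);
	return 0;
}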
3111 22 1889 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ for_each_online_node(node) \ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. 
*/ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define 
topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) #endif /* _LINUX_TOPOLOGY_H */
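/*
 * Illustrative sketch (not kernel code): with the fallback definitions
 * above, node_distance() only distinguishes "same node" (LOCAL_DISTANCE)
 * from "different node" (REMOTE_DISTANCE), and node reclaim is considered
 * only when the distance does not exceed RECLAIM_DISTANCE. A userspace
 * model of that comparison using the default constants from the header.
 */
#include <stdio.h>

#define LOCAL_DISTANCE		10
#define REMOTE_DISTANCE		20
#define RECLAIM_DISTANCE	30

static int node_distance(int from, int to)
{
	return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
}

int main(void)
{
	int from = 0, to = 1;
	int dist = node_distance(from, to);

	printf("distance(%d,%d) = %d, reclaim candidate: %s\n",
	       from, to, dist, dist <= RECLAIM_DISTANCE ? "yes" : "no");
	return 0;
}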
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // SPDX-License-Identifier: GPL-2.0-or-later /* xfrm4_protocol.c - Generic xfrm protocol multiplexer. * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv4/tunnel4.c */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/skbuff.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; static DEFINE_MUTEX(xfrm4_protocol_mutex); static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) { switch (protocol) { case IPPROTO_ESP: return &esp4_handlers; case IPPROTO_AH: return &ah4_handlers; case IPPROTO_COMP: return &ipcomp4_handlers; } return NULL; } #define for_each_protocol_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm4_protocol *handler; struct xfrm4_protocol __rcu **head = proto_handlers(protocol); if (!head) return 0; for_each_protocol_rcu(*head, handler) if ((ret = handler->cb_handler(skb, err)) <= 0) return ret; return 0; } int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { int ret; struct xfrm4_protocol *handler; struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); if (!head) goto out; if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) goto drop; } for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm4_rcv_encap); static int xfrm4_esp_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(esp4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } 
static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
	struct xfrm4_protocol *handler;

	for_each_protocol_rcu(esp4_handlers, handler)
		if (!handler->err_handler(skb, info))
			return 0;

	return -ENOENT;
}

static int xfrm4_ah_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm4_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;

	for_each_protocol_rcu(ah4_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	kfree_skb(skb);
	return 0;
}

static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
	struct xfrm4_protocol *handler;

	for_each_protocol_rcu(ah4_handlers, handler)
		if (!handler->err_handler(skb, info))
			return 0;

	return -ENOENT;
}

static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm4_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;

	for_each_protocol_rcu(ipcomp4_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	kfree_skb(skb);
	return 0;
}

static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
	struct xfrm4_protocol *handler;

	for_each_protocol_rcu(ipcomp4_handlers, handler)
		if (!handler->err_handler(skb, info))
			return 0;

	return -ENOENT;
}

static const struct net_protocol esp4_protocol = {
	.handler	= xfrm4_esp_rcv,
	.err_handler	= xfrm4_esp_err,
	.no_policy	= 1,
};

static const struct net_protocol ah4_protocol = {
	.handler	= xfrm4_ah_rcv,
	.err_handler	= xfrm4_ah_err,
	.no_policy	= 1,
};

static const struct net_protocol ipcomp4_protocol = {
	.handler	= xfrm4_ipcomp_rcv,
	.err_handler	= xfrm4_ipcomp_err,
	.no_policy	= 1,
};

static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
	.family		= AF_INET,
	.callback	= xfrm4_rcv_cb,
};

static inline const struct net_protocol *netproto(unsigned char protocol)
{
	switch (protocol) {
	case IPPROTO_ESP:
		return &esp4_protocol;
	case IPPROTO_AH:
		return &ah4_protocol;
	case IPPROTO_COMP:
		return &ipcomp4_protocol;
	}

	return NULL;
}

int xfrm4_protocol_register(struct xfrm4_protocol *handler,
			    unsigned char protocol)
{
	struct xfrm4_protocol __rcu **pprev;
	struct xfrm4_protocol *t;
	bool add_netproto = false;
	int ret = -EEXIST;
	int priority = handler->priority;

	if (!proto_handlers(protocol) || !netproto(protocol))
		return -EINVAL;

	mutex_lock(&xfrm4_protocol_mutex);

	if (!rcu_dereference_protected(*proto_handlers(protocol),
				       lockdep_is_held(&xfrm4_protocol_mutex)))
		add_netproto = true;

	for (pprev = proto_handlers(protocol);
	     (t = rcu_dereference_protected(*pprev,
			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
	     pprev = &t->next) {
		if (t->priority < priority)
			break;
		if (t->priority == priority)
			goto err;
	}

	handler->next = *pprev;
	rcu_assign_pointer(*pprev, handler);

	ret = 0;

err:
	mutex_unlock(&xfrm4_protocol_mutex);

	if (add_netproto) {
		if (inet_add_protocol(netproto(protocol), protocol)) {
			pr_err("%s: can't add protocol\n", __func__);
			ret = -EAGAIN;
		}
	}

	return ret;
}
EXPORT_SYMBOL(xfrm4_protocol_register);

int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
			      unsigned char protocol)
{
	struct xfrm4_protocol __rcu **pprev;
	struct xfrm4_protocol *t;
	int ret = -ENOENT;

	if (!proto_handlers(protocol) || !netproto(protocol))
		return -EINVAL;

	mutex_lock(&xfrm4_protocol_mutex);

	for (pprev = proto_handlers(protocol);
	     (t = rcu_dereference_protected(*pprev,
			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
	     pprev = &t->next) {
		if (t == handler) {
			*pprev = handler->next;
			ret = 0;
			break;
		}
	}

	if (!rcu_dereference_protected(*proto_handlers(protocol),
				       lockdep_is_held(&xfrm4_protocol_mutex))) {
		if (inet_del_protocol(netproto(protocol), protocol) < 0) {
			pr_err("%s: can't remove protocol\n", __func__);
			ret = -EAGAIN;
		}
	}

	mutex_unlock(&xfrm4_protocol_mutex);

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(xfrm4_protocol_deregister);

void __init xfrm4_protocol_init(void)
{
	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
}
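A minimal sketch of how a client registers against this multiplexer. Field names follow the usage visible in the file above; the example_* identifiers are invented, and the xfrm4_rcv()/xfrm4_rcv_encap() entry points are assumed to be available from net/xfrm.h, roughly as the in-tree ESP code wires them up.

static int example_esp_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;	/* <= 0 stops the xfrm4_rcv_cb() walk */
}

static int example_esp_err(struct sk_buff *skb, u32 info)
{
	return 0;	/* 0 means "ICMP error handled", ending the walk */
}

static struct xfrm4_protocol example_esp4_protocol = {
	.handler	= xfrm4_rcv,
	.input_handler	= xfrm4_rcv_encap,
	.cb_handler	= example_esp_rcv_cb,
	.err_handler	= example_esp_err,
	.priority	= 0,	/* higher priorities are tried first */
};

static int __init example_init(void)
{
	/* The first handler registered for IPPROTO_ESP also hooks the
	 * protocol into the inet stack via inet_add_protocol(). */
	return xfrm4_protocol_register(&example_esp4_protocol, IPPROTO_ESP);
}

static void __exit example_exit(void)
{
	xfrm4_protocol_deregister(&example_esp4_protocol, IPPROTO_ESP);
}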
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement the SCTP primitive functions from Section 10.
 *
 * Note that the descriptions from the specification are USER level
 * functions--this file is the functions which populate the struct proto
 * for SCTP which is the BOTTOM of the sockets interface.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Narasimha Budihal <narasimha@refcode.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Ardelle Fan <ardelle.fan@intel.com>
 *    Kevin Gao <kevin.gao@intel.com>
 */

#include <linux/types.h>
#include <linux/list.h>   /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/time.h>   /* For struct timeval */
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

#define DECLARE_PRIMITIVE(name) \
/* This is called in the code as sctp_primitive_ ## name. */ \
int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
			    void *arg) { \
	int error = 0; \
	enum sctp_event_type event_type; union sctp_subtype subtype; \
	enum sctp_state state; \
	struct sctp_endpoint *ep; \
	\
	event_type = SCTP_EVENT_T_PRIMITIVE; \
	subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \
	state = asoc ? asoc->state : SCTP_STATE_CLOSED; \
	ep = asoc ? asoc->ep : NULL; \
	\
	error = sctp_do_sm(net, event_type, subtype, state, ep, asoc, \
			   arg, GFP_KERNEL); \
	return error; \
}

/* 10.1 ULP-to-SCTP
 * B) Associate
 *
 * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
 *         outbound stream count)
 * -> association id [,destination transport addr list] [,outbound stream
 *    count]
 *
 * This primitive allows the upper layer to initiate an association to a
 * specific peer endpoint.
 *
 * This version assumes that asoc is fully populated with the initial
 * parameters. We then return a traditional kernel indicator of
 * success or failure.
 */

/* This is called in the code as sctp_primitive_ASSOCIATE. */
DECLARE_PRIMITIVE(ASSOCIATE)

/* 10.1 ULP-to-SCTP
 * C) Shutdown
 *
 * Format: SHUTDOWN(association id)
 * -> result
 *
 * Gracefully closes an association. Any locally queued user data
 * will be delivered to the peer. The association will be terminated only
 * after the peer acknowledges all the SCTP packets sent. A success code
 * will be returned on successful termination of the association. If
 * attempting to terminate the association results in a failure, an error
 * code shall be returned.
 */

DECLARE_PRIMITIVE(SHUTDOWN);

/* 10.1 ULP-to-SCTP
 * C) Abort
 *
 * Format: Abort(association id [, cause code])
 * -> result
 *
 * Ungracefully closes an association. Any locally queued user data
 * will be discarded and an ABORT chunk is sent to the peer. A success
 * code will be returned on successful abortion of the association. If
 * attempting to abort the association results in a failure, an error
 * code shall be returned.
 */

DECLARE_PRIMITIVE(ABORT);

/* 10.1 ULP-to-SCTP
 * E) Send
 *
 * Format: SEND(association id, buffer address, byte count [,context]
 *         [,stream id] [,life time] [,destination transport address]
 *         [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
 * -> result
 *
 * This is the main method to send user data via SCTP.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 *  o buffer address - the location where the user message to be
 *    transmitted is stored;
 *
 *  o byte count - The size of the user data in number of bytes;
 *
 * Optional attributes:
 *
 *  o context - an optional 32 bit integer that will be carried in the
 *    sending failure notification to the ULP if the transportation of
 *    this User Message fails.
 *
 *  o stream id - to indicate which stream to send the data on. If not
 *    specified, stream 0 will be used.
 *
 *  o life time - specifies the life time of the user data. The user data
 *    will not be sent by SCTP after the life time expires. This
 *    parameter can be used to avoid efforts to transmit stale
 *    user messages. SCTP notifies the ULP if the data cannot be
 *    initiated to transport (i.e. sent to the destination via SCTP's
 *    send primitive) within the life time variable. However, the
 *    user data will be transmitted if SCTP has attempted to transmit a
 *    chunk before the life time expired.
 *
 *  o destination transport address - specified as one of the destination
 *    transport addresses of the peer endpoint to which this packet
 *    should be sent. Whenever possible, SCTP should use this destination
 *    transport address for sending the packets, instead of the current
 *    primary path.
 *
 *  o unorder flag - this flag, if present, indicates that the user
 *    would like the data delivered in an unordered fashion to the peer
 *    (i.e., the U flag is set to 1 on all DATA chunks carrying this
 *    message).
 *
 *  o no-bundle flag - instructs SCTP not to bundle this user data with
 *    other outbound DATA chunks. SCTP MAY still bundle even when
 *    this flag is present, when faced with network congestion.
 *
 *  o payload protocol-id - A 32 bit unsigned integer that is to be
 *    passed to the peer indicating the type of payload protocol data
 *    being transmitted. This value is passed as opaque data by SCTP.
 */

DECLARE_PRIMITIVE(SEND);

/* 10.1 ULP-to-SCTP
 * J) Request Heartbeat
 *
 * Format: REQUESTHEARTBEAT(association id, destination transport address)
 *
 * -> result
 *
 * Instructs the local endpoint to perform a HeartBeat on the specified
 * destination transport address of the given association. The returned
 * result should indicate whether the transmission of the HEARTBEAT
 * chunk to the destination address is successful.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 *  o destination transport address - the transport address of the
 *    association on which a heartbeat should be issued.
 */

DECLARE_PRIMITIVE(REQUESTHEARTBEAT);

/* ADDIP
 * 3.1.1 Address Configuration Change Chunk (ASCONF)
 *
 * This chunk is used to communicate to the remote endpoint one of the
 * configuration change requests that MUST be acknowledged.
 * The information carried in the ASCONF Chunk uses the form of a
 * Type-Length-Value (TLV), as described in "3.2.1 Optional/
 * Variable-length Parameter Format" in RFC2960 [5], for all variable
 * parameters.
 */

DECLARE_PRIMITIVE(ASCONF);

/* RE-CONFIG 5.1 */
DECLARE_PRIMITIVE(RECONF);
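Each DECLARE_PRIMITIVE(name) above expands into an ordinary function named sctp_primitive_<name>(). Written out by hand for SHUTDOWN, the expansion is roughly the following; this is a sketch derived from the macro body shown earlier, not a separate implementation, and the socket layer then calls such wrappers directly (for example the ASSOCIATE and SEND primitives from the connect() and sendmsg() paths).

int sctp_primitive_SHUTDOWN(struct net *net, struct sctp_association *asoc,
			    void *arg)
{
	enum sctp_event_type event_type = SCTP_EVENT_T_PRIMITIVE;
	union sctp_subtype subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_SHUTDOWN);
	enum sctp_state state = asoc ? asoc->state : SCTP_STATE_CLOSED;
	struct sctp_endpoint *ep = asoc ? asoc->ep : NULL;

	/* Hand the primitive to the state machine, which decides what
	 * SHUTDOWN means in the association's current state. */
	return sctp_do_sm(net, event_type, subtype, state, ep, asoc,
			  arg, GFP_KERNEL);
}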
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NF_TABLES_H
#define _NET_NF_TABLES_H

#include <asm/unaligned.h>
#include <linux/list.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/u64_stats_sync.h>
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netlink.h>
#include <net/flow_offload.h>
#include <net/netns/generic.h>

#define NFT_MAX_HOOKS	(NF_INET_INGRESS + 1)

struct module;

#define NFT_JUMP_STACK_SIZE	16

enum {
	NFT_PKTINFO_L4PROTO	= (1 << 0),
	NFT_PKTINFO_INNER	= (1 << 1),
	NFT_PKTINFO_INNER_FULL	= (1 << 2),
};

struct nft_pktinfo {
	struct sk_buff			*skb;
	const struct nf_hook_state	*state;
	u8				flags;
	u8				tprot;
	u16				fragoff;
	u16				thoff;
	u16				inneroff;
};

static inline struct sock *nft_sk(const struct nft_pktinfo *pkt)
{
	return pkt->state->sk;
}

static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt)
{
	return pkt->thoff;
}

static inline struct net *nft_net(const struct nft_pktinfo *pkt)
{
	return pkt->state->net;
}

static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
{
	return pkt->state->hook;
}

static inline u8 nft_pf(const struct nft_pktinfo *pkt)
{
	return pkt->state->pf;
}

static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
{
	return pkt->state->in;
}

static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
{
	return pkt->state->out;
}

static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	pkt->skb = skb;
	pkt->state = state;
}

static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt)
{
	pkt->flags = 0;
	pkt->tprot = 0;
	pkt->thoff = 0;
	pkt->fragoff = 0;
}

/**
 * struct nft_verdict - nf_tables verdict
 *
 * @code: nf_tables/netfilter verdict code
 * @chain: destination chain for NFT_JUMP/NFT_GOTO
 */
struct nft_verdict {
	u32				code;
	struct nft_chain		*chain;
};

struct nft_data {
	union {
		u32			data[4];
		struct nft_verdict	verdict;
	};
} __attribute__((aligned(__alignof__(u64))));

#define NFT_REG32_NUM		20

/**
 * struct nft_regs - nf_tables register set
 *
 * @data: data registers
 * @verdict: verdict register
 *
 * The first four data registers alias to the verdict register.
*/ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? 
NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; struct nft_elem_priv *priv; }; static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) { return (void *)priv; } /** * enum nft_iter_type - nftables set iterator type * * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_SET_CLASS_O_1: constant, O(1) * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void 
*nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) 
container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @num_exprs: numbers of exprs * @exprs: stateful expression * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) { return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? 
msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions assiciated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data 
*nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @clone: Expression clone 
function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow * offload array * @offload_stats: function to synchronize hardware stats via updating the counter expression * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if 
(__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @blob_gen_0: rule blob pointer to the current generation * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum NFTA_CHAIN_FLAGS * @bound: bind or not * @genmask: generation mask * @name: name of the chain * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct 
nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. 
*/ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @nlpid: netlink port ID * @name: name of the table * @udlen: length of the user data * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_table_is_orphan(const struct nft_table *table) { return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == NFT_TABLE_F_PERSIST; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * 
@family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" 
__stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context * @data: internal information related to the transaction */ struct nft_trans { struct list_head list; struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; char data[]; }; struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) #define 
nft_trans_flow_rule(trans) \ (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) #define nft_trans_rule_bound(trans) \ (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) #define nft_trans_set_update(trans) \ (((struct nft_trans_set *)trans->data)->update) #define nft_trans_set_timeout(trans) \ (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) #define nft_trans_set_size(trans) \ (((struct nft_trans_set *)trans->data)->size) struct nft_trans_chain { struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_chain(trans) \ (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ (((struct nft_trans_chain *)trans->data)->name) #define nft_trans_chain_stats(trans) \ (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) #define nft_trans_chain_bound(trans) \ (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ (((struct nft_trans_chain *)trans->data)->basechain) #define nft_trans_chain_hooks(trans) \ (((struct nft_trans_chain *)trans->data)->hook_list) struct nft_trans_table { bool update; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) struct nft_trans_elem { struct nft_set *set; struct nft_elem_priv *elem_priv; bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem_priv(trans) \ (((struct nft_trans_elem *)trans->data)->elem_priv) #define nft_trans_elem_set_bound(trans) \ (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) #define nft_trans_obj_newobj(trans) \ (((struct nft_trans_obj *)trans->data)->newobj) #define nft_trans_obj_update(trans) \ (((struct nft_trans_obj *)trans->data)->update) struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; u32 flags; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) #define nft_trans_flowtable_update(trans) \ (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set 
*set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */
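The two-generation bitmask scheme documented in the header above (a set genmask bit means "inactive in that generation") is easier to follow with a small standalone illustration. The sketch below is a userspace toy, not kernel code: the demo_* types and helpers are hypothetical stand-ins that mirror nft_genmask_cur()/nft_genmask_next() and the nft_activate_next()/nft_clear()/nft_deactivate_next() macros, and the commit step (cursor flip followed by clearing the stale bit) is simplified relative to the real nf_tables commit path.

#include <assert.h>
#include <stdio.h>

struct demo_net { unsigned int gencursor; };   /* stand-in for net->nft.gencursor, 0 or 1 */
struct demo_obj { unsigned char genmask; };    /* a set bit means inactive in that generation */

static unsigned char demo_genmask_cur(const struct demo_net *net)
{
	return 1 << net->gencursor;
}

static unsigned char demo_genmask_next(const struct demo_net *net)
{
	return 1 << (net->gencursor + 1 == 1 ? 1 : 0);
}

static int demo_is_active(const struct demo_net *net, const struct demo_obj *obj)
{
	return (obj->genmask & demo_genmask_cur(net)) == 0;
}

static int demo_is_active_next(const struct demo_net *net, const struct demo_obj *obj)
{
	return (obj->genmask & demo_genmask_next(net)) == 0;
}

int main(void)
{
	struct demo_net net = { .gencursor = 0 };
	struct demo_obj obj;

	/* like nft_activate_next(): new object, inactive now, active in the next generation */
	obj.genmask = demo_genmask_cur(&net);
	assert(!demo_is_active(&net, &obj) && demo_is_active_next(&net, &obj));

	/* commit: flip the cursor, then clear the stale generation bit (nft_clear()) */
	net.gencursor = net.gencursor ? 0 : 1;
	obj.genmask &= ~demo_genmask_next(&net);
	assert(demo_is_active(&net, &obj) && demo_is_active_next(&net, &obj));

	/* like nft_deactivate_next(): still visible now, gone after the next commit */
	obj.genmask = demo_genmask_next(&net);
	assert(demo_is_active(&net, &obj) && !demo_is_active_next(&net, &obj));

	printf("two-generation walk-through OK\n");
	return 0;
}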
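The struct nft_trans layout above, where a fixed transaction header ends in a flexible data[] array and per-type payloads such as struct nft_trans_rule are reached through cast macros like nft_trans_rule(), follows a common pattern that a tiny self-contained sketch can show. Everything prefixed demo_ below is hypothetical and exists only for this illustration; it is not part of the nf_tables API.

#include <stdio.h>
#include <stdlib.h>

struct demo_trans {
	int msg_type;
	char data[];                    /* type-specific payload follows the header */
};

struct demo_trans_rule {
	unsigned int rule_id;
	int bound;
};

/* accessor macros overlay the payload struct on the flexible data area,
 * mirroring the nft_trans_rule()/nft_trans_rule_id() style above */
#define demo_trans_rule_id(trans) \
	(((struct demo_trans_rule *)(trans)->data)->rule_id)
#define demo_trans_rule_bound(trans) \
	(((struct demo_trans_rule *)(trans)->data)->bound)

static struct demo_trans *demo_trans_alloc(int msg_type, size_t payload)
{
	/* one allocation covers the header plus the type-specific payload */
	struct demo_trans *trans = calloc(1, sizeof(*trans) + payload);

	if (trans)
		trans->msg_type = msg_type;
	return trans;
}

int main(void)
{
	struct demo_trans *trans =
		demo_trans_alloc(1, sizeof(struct demo_trans_rule));

	if (!trans)
		return 1;
	demo_trans_rule_id(trans) = 42;
	demo_trans_rule_bound(trans) = 1;
	printf("msg_type=%d rule_id=%u bound=%d\n", trans->msg_type,
	       demo_trans_rule_id(trans), demo_trans_rule_bound(trans));
	free(trans);
	return 0;
}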
/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/if_vlan.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <rdma/ib_cache.h> #include "core_priv.h" struct ib_pkey_cache { int table_len; u16 table[] __counted_by(table_len); }; struct ib_update_work { struct work_struct work; struct ib_event event; bool enforce_security; }; union ib_gid zgid; EXPORT_SYMBOL(zgid); enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_state { GID_TABLE_ENTRY_INVALID = 1, GID_TABLE_ENTRY_VALID = 2, /* * Indicates that entry is pending to be removed, there may * be active users of this GID entry. * When last user of the GID entry releases reference to it, * GID entry is detached from the table. */ GID_TABLE_ENTRY_PENDING_DEL = 3, }; struct roce_gid_ndev_storage { struct rcu_head rcu_head; struct net_device *ndev; }; struct ib_gid_table_entry { struct kref kref; struct work_struct del_work; struct ib_gid_attr attr; void *context; /* Store the ndev pointer to release reference later on in * call_rcu context because by that time gid_table_entry * and attr might be already freed. So keep a copy of it. * ndev_storage is freed by rcu callback. */ struct roce_gid_ndev_storage *ndev_storage; enum gid_table_entry_state state; }; struct ib_gid_table { int sz; /* In RoCE, adding a GID to the table requires: * (a) Find if this GID already exists. * (b) Find a free space. 
* (c) Write the new GID * * Delete requires different set of operations: * (a) Find the GID * (b) Delete it. * **/ /* Any writer to data_vec must hold this lock and the write side of * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ struct mutex lock; /* rwlock protects data_vec[ix]->state and entry pointer. */ rwlock_t rwlock; struct ib_gid_table_entry **data_vec; /* bit field, each bit indicates the index of default GID */ u32 default_gid_indices; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; event.device = ib_dev; event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; ib_dispatch_event_clients(&event); } static const char * const gid_type_str[] = { /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for * user space compatibility reasons. */ [IB_GID_TYPE_IB] = "IB/RoCE v1", [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) { if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) return gid_type_str[gid_type]; return "Invalid GID type"; } EXPORT_SYMBOL(ib_cache_gid_type_str); /** rdma_is_zero_gid - Check if given GID is zero or not. * @gid: GID to check * Returns true if given GID is zero, returns false otherwise. */ bool rdma_is_zero_gid(const union ib_gid *gid) { return !memcmp(gid, &zgid, sizeof(*gid)); } EXPORT_SYMBOL(rdma_is_zero_gid); /** is_gid_index_default - Check if a given index belongs to * reserved default GIDs or not. * @table: GID table pointer * @index: Index to check in GID table * Returns true if index is one of the reserved default GID index otherwise * returns false. */ static bool is_gid_index_default(const struct ib_gid_table *table, unsigned int index) { return index < 32 && (BIT(index) & table->default_gid_indices); } int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; size_t len; int err = -EINVAL; len = strlen(buf); if (len == 0) return -EINVAL; if (buf[len - 1] == '\n') len--; for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && len == strlen(gid_type_str[i])) { err = i; break; } return err; } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) { return !entry; } static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) { return entry && entry->state == GID_TABLE_ENTRY_VALID; } static void schedule_free_gid(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); queue_work(ib_wq, &entry->del_work); } static void put_gid_ndev(struct rcu_head *head) { struct roce_gid_ndev_storage *storage = container_of(head, struct roce_gid_ndev_storage, rcu_head); WARN_ON(!storage->ndev); /* At this point its safe to release netdev reference, * as all callers working on gid_attr->ndev are done * using this netdev. 
*/ dev_put(storage->ndev); kfree(storage); } static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); /* * The only way to avoid overwriting NULL in table is * by comparing if it is same entry in table or not! * If new entry in table is added by the time we free here, * don't overwrite the table entry. */ if (entry == table->data_vec[entry->attr.index]) table->data_vec[entry->attr.index] = NULL; /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); if (entry->ndev_storage) call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } static void free_gid_entry(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); free_gid_entry_locked(entry); } /** * free_gid_work - Release reference to the GID entry * @work: Work structure to refer to GID entry which needs to be * deleted. * * free_gid_work() frees the entry from the HCA's hardware table * if provider supports it. It releases reference to netdevice. */ static void free_gid_work(struct work_struct *work) { struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); free_gid_entry_locked(entry); mutex_unlock(&table->lock); } static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; ndev = rcu_dereference_protected(attr->ndev, 1); if (ndev) { entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), GFP_KERNEL); if (!entry->ndev_storage) { kfree(entry); return NULL; } dev_hold(ndev); entry->ndev_storage->ndev = ndev; } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; } static void store_gid_entry(struct ib_gid_table *table, struct ib_gid_table_entry *entry) { entry->state = GID_TABLE_ENTRY_VALID; dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); table->data_vec[entry->attr.index] = entry; write_unlock_irq(&table->rwlock); } static void get_gid_entry(struct ib_gid_table_entry *entry) { kref_get(&entry->kref); } static void put_gid_entry(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, schedule_free_gid); } static void put_gid_entry_locked(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, free_gid_entry); } static int add_roce_gid(struct ib_gid_table_entry *entry) { const struct ib_gid_attr *attr = &entry->attr; int ret; if (!attr->ndev) { dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } } 
return 0; } /** * del_gid - Delete GID table entry * * @ib_dev: IB device whose GID entry to be deleted * @port: Port number of the IB device * @table: GID table of the IB device for a port * @ix: GID entry index to delete * */ static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; entry->state = GID_TABLE_ENTRY_PENDING_DEL; /* * For non RoCE protocol, GID entry slot is ready to use. */ if (!rdma_protocol_roce(ib_dev, port)) table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); ndev_storage = entry->ndev_storage; if (ndev_storage) { entry->ndev_storage = NULL; rcu_assign_pointer(entry->attr.ndev, NULL); call_rcu(&ndev_storage->rcu_head, put_gid_ndev); } put_gid_entry_locked(entry); } /** * add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero * GID addition for non RoCE ports for HCA's who report them as valid * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; int ret = 0; /* * Invalidate any old entry in the table to make it safe to write to * this index. */ if (is_gid_entry_valid(table->data_vec[attr->index])) del_gid(attr->device, attr->port_num, table, attr->index); /* * Some HCA's report multiple GID entries with only one valid GID, and * leave other unused entries as the zero GID. Convert zero GIDs to * empty table entries instead of storing them. */ if (rdma_is_zero_gid(&attr->gid)) return 0; entry = alloc_gid_entry(attr); if (!entry) return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { ret = add_roce_gid(entry); if (ret) goto done; } store_gid_entry(table, entry); return 0; done: put_gid_entry(entry); return ret; } /* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) { int i = 0; int found = -1; int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { struct ib_gid_table_entry *data = table->data_vec[i]; struct ib_gid_attr *attr; int curr_index = i; i++; /* find_gid() is used during GID addition where it is expected * to return a free entry slot which is not duplicate. * Free entry slot is requested and returned if pempty is set, * so lookup free slot only if requested. */ if (pempty && empty < 0) { if (is_gid_entry_free(data) && default_gid == is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our * found slot must be one of the DEFAULT * reserved slots or we fail. * This ensures that only DEFAULT reserved * slots are used for default property GIDs. */ empty = curr_index; } } /* * Additionally find_gid() is used to find valid entry during * lookup operation; so ignore the entries which are marked as * pending for removal and the entries which are marked as * invalid. 
*/ if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && is_gid_index_default(table, curr_index) != default_gid) continue; found = curr_index; } if (pempty) *pempty = empty; return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); addrconf_ifid_eui48(&gid->raw[8], dev); } static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int empty; int ix; /* Do not allow adding zero GID in support of * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ if (rdma_is_zero_gid(gid)) return -EINVAL; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, &empty); if (ix >= 0) goto out_unlock; if (empty < 0) { ret = -ENOSPC; goto out_unlock; } attr->device = ib_dev; attr->index = empty; attr->port_num = port; attr->gid = *gid; ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV; return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int _ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int ix; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, NULL); if (ix < 0) { ret = -EINVAL; goto out_unlock; } del_gid(ib_dev, port, table, ix); dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_debug("%s: can't delete gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; int ix; bool deleted = false; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { if (is_gid_entry_valid(table->data_vec[ix]) && table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } } mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); return 0; } /** * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. * @ib_dev: The device to query. * @gid: The GID value to search for. 
* @gid_type: The GID type to search for. * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * Returns sgid attributes if the GID is found with valid reference or * returns ERR_PTR for the error. * The caller must invoke rdma_put_gid_attr() to release the reference. */ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; const struct ib_gid_attr *attr; unsigned long flags; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-ENOENT); table = rdma_gid_table(ib_dev, port); if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { get_gid_entry(table->data_vec[local_index]); attr = &table->data_vec[local_index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. 
* */ const struct ib_gid_attr *rdma_find_gid_by_filter( struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) { const struct ib_gid_attr *res = ERR_PTR(-ENOENT); struct ib_gid_table *table; unsigned long flags; unsigned int i; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-EINVAL); table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_table_entry *entry = table->data_vec[i]; if (!is_gid_entry_valid(entry)) continue; if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) continue; if (filter(gid, &entry->attr, context)) { get_gid_entry(entry); res = &entry->attr; break; } } read_unlock_irqrestore(&table->rwlock, flags); return res; } static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); if (!table->data_vec) goto err_free_table; mutex_init(&table->lock); table->sz = sz; rwlock_init(&table->rwlock); return table; err_free_table: kfree(table); return NULL; } static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { bool leak = false; int i; if (!table) return; for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; if (kref_read(&table->data_vec[i]->kref) > 1) { dev_err(&device->dev, "GID entry ref leak for index %d ref=%u\n", i, kref_read(&table->data_vec[i]->kref)); leak = true; } } if (leak) return; mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; if (!table) return; mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { if (is_gid_entry_valid(table->data_vec[i])) del_gid(ib_dev, port, table, i); } mutex_unlock(&table->lock); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { union ib_gid gid = { }; struct ib_gid_attr gid_attr; unsigned int gid_type; unsigned long mask; mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { make_default_gid(ndev, &gid); __ib_cache_gid_add(ib_dev, port, &gid, &gid_attr, mask, true); } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { _ib_cache_gid_del(ib_dev, port, &gid, &gid_attr, mask, true); } } } static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); /* Reserve starting indices for default GIDs */ for (i = 0; i < num_default_gids && i < table->sz; i++) table->default_gid_indices |= BIT(i); } static void gid_table_release_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) { release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { struct ib_gid_table *table; u32 rdma_port; 
rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->port_data[rdma_port].cache.gid = table; } return 0; rollback_table_setup: gid_table_release_one(ib_dev); return -ENOMEM; } static void gid_table_cleanup_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) cleanup_gid_table_port(ib_dev, p, ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) { int err; err = _gid_table_setup_one(ib_dev); if (err) return err; rdma_roce_rescan_device(ib_dev); return err; } /** * rdma_query_gid - Read the GID content from the GID software cache * @device: Device to query the GID * @port_num: Port number of the device * @index: Index of the GID table entry to read * @gid: Pointer to GID where to store the entry's GID * * rdma_query_gid() only reads the GID entry content for requested device, * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't * hold any reference to the GID table entry in the HCA or software cache. * * Returns 0 on success or appropriate error code. * */ int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); if (index < 0 || index >= table->sz) { res = -EINVAL; goto done; } if (!is_gid_entry_valid(table->data_vec[index])) { res = -ENOENT; goto done; } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; done: read_unlock_irqrestore(&table->rwlock, flags); return res; } EXPORT_SYMBOL(rdma_query_gid); /** * rdma_read_gid_hw_context - Read the HW GID context from GID attribute * @attr: Pointer to the GID attribute * * rdma_read_gid_hw_context() reads the driver's GID HW context corresponding * to the SGID attr. Callers are required to already be holding the reference * to an existing GID entry. * * Returns the HW GID context * */ void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) { return container_of(attr, struct ib_gid_table_entry, attr)->context; } EXPORT_SYMBOL(rdma_read_gid_hw_context); /** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * rdma_find_gid() searches for the specified GID value in the software cache. * * Returns GID attributes if a valid GID is found or returns ERR_PTR for the * error. The caller must invoke rdma_put_gid_attr() to release the reference. 
* */ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, struct net_device *ndev) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { const struct ib_gid_attr *attr; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, u32 port_num, int index, u16 *pkey) { struct ib_pkey_cache *cache; unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, u64 *sn_pfx) { unsigned long flags; read_lock_irqsave(&device->cache_lock, flags); *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache_lock, flags); } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); int ib_find_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; int partial_ix = -1; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { *index = i; ret = 0; break; } else { partial_ix = i; } } if (ret && partial_ix >= 0) { *index = partial_ix; ret = 0; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; ret = 0; break; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; int ret = 0; if 
(!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_port_state); /** * rdma_get_gid_attr - Returns GID attributes for a port of a device * at a requested gid_index, if a valid GID entry exists. * @device: The device to query. * @port_num: The port number on the device where the GID value * is to be queried. * @index: Index of the GID table entry whose attributes are to * be queried. * * rdma_get_gid_attr() acquires reference count of gid attributes from the * cached GID table. Caller must invoke rdma_put_gid_attr() to release * reference to gid attribute regardless of link layer. * * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error * code. */ const struct ib_gid_attr * rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; if (!rdma_is_port_valid(device, port_num)) return ERR_PTR(-EINVAL); table = rdma_gid_table(device, port_num); if (index < 0 || index >= table->sz) return ERR_PTR(-EINVAL); read_lock_irqsave(&table->rwlock, flags); if (!is_gid_entry_valid(table->data_vec[index])) goto done; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; done: read_unlock_irqrestore(&table->rwlock, flags); return attr; } EXPORT_SYMBOL(rdma_get_gid_attr); /** * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. * @device: The device to query. * @entries: Entries where GID entries are returned. * @max_entries: Maximum number of entries that can be returned. * Entries array must be allocated to hold max_entries number of entries. * * Returns number of entries on success or appropriate error code. */ ssize_t rdma_query_gid_table(struct ib_device *device, struct ib_uverbs_gid_entry *entries, size_t max_entries) { const struct ib_gid_attr *gid_attr; ssize_t num_entries = 0, ret; struct ib_gid_table *table; u32 port_num, i; struct net_device *ndev; unsigned long flags; rdma_for_each_port(device, port_num) { table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { if (!is_gid_entry_valid(table->data_vec[i])) continue; if (num_entries >= max_entries) { ret = -EINVAL; goto err; } gid_attr = &table->data_vec[i]->attr; memcpy(&entries->gid, &gid_attr->gid, sizeof(gid_attr->gid)); entries->gid_index = gid_attr->index; entries->port_num = gid_attr->port_num; entries->gid_type = gid_attr->gid_type; ndev = rcu_dereference_protected( gid_attr->ndev, lockdep_is_held(&table->rwlock)); if (ndev) entries->netdev_ifindex = ndev->ifindex; num_entries++; entries++; } read_unlock_irqrestore(&table->rwlock, flags); } return num_entries; err: read_unlock_irqrestore(&table->rwlock, flags); return ret; } EXPORT_SYMBOL(rdma_query_gid_table); /** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. * * rdma_put_gid_attr() must be used to release reference whose * reference is acquired using rdma_get_gid_attr() or any APIs * which returns a pointer to the ib_gid_attr regardless of link layer * of IB or RoCE. 
* */ void rdma_put_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); put_gid_entry(entry); } EXPORT_SYMBOL(rdma_put_gid_attr); /** * rdma_hold_gid_attr - Get reference to existing GID attribute * * @attr: Pointer to the GID attribute whose reference * needs to be taken. * * Increase the reference count to a GID attribute to keep it from being * freed. Callers are required to already be holding a reference to attribute. * */ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); get_gid_entry(entry); } EXPORT_SYMBOL(rdma_hold_gid_attr); /** * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice * which must be in UP state. * * @attr:Pointer to the GID attribute * * Returns pointer to netdevice if the netdevice was attached to GID and * netdevice is in UP state. Caller must hold RCU lock as this API * reads the netdev flags which can change while netdevice migrates to * different net namespace. Returns ERR_PTR with error code otherwise. * */ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; struct net_device *ndev = ERR_PTR(-EINVAL); u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); static int get_lower_dev_vlan(struct net_device *lower_dev, struct netdev_nested_priv *priv) { u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); /* We are interested only in first level vlan device, so * always return 1 to stop iterating over next level devices. */ return 1; } /** * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address * of a GID entry. * * @attr: GID attribute pointer whose L2 fields to be read * @vlan_id: Pointer to vlan id to fill up if the GID entry has * vlan id. It is optional. * @smac: Pointer to smac to fill up for a GID entry. It is optional. * * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id * (if gid entry has vlan) and source MAC, or returns error. */ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { struct netdev_nested_priv priv = { .data = (void *)vlan_id, }; struct net_device *ndev; rcu_read_lock(); ndev = rcu_dereference(attr->ndev); if (!ndev) { rcu_read_unlock(); return -ENODEV; } if (smac) ether_addr_copy(smac, ndev->dev_addr); if (vlan_id) { *vlan_id = 0xffff; if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. 
*/ netdev_walk_all_lower_dev_rcu(attr->ndev, get_lower_dev_vlan, &priv); } } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; int ret = 0; int i; gid_attr.device = device; gid_attr.port_num = port; table = rdma_gid_table(device, port); mutex_lock(&table->lock); for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); if (ret) { dev_warn(&device->dev, "query_gid failed (%d) for index %d\n", ret, i); goto err; } if (rdma_protocol_iwarp(device, port)) { struct net_device *ndev; ndev = ib_device_get_netdev(device, port); if (!ndev) continue; RCU_INIT_POINTER(gid_attr.ndev, ndev); dev_put(ndev); } gid_attr.index = i; tprops->subnet_prefix = be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); return ret; } static int ib_cache_update(struct ib_device *device, u32 port, bool update_gids, bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL; struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; if (!rdma_is_port_valid(device, port)) return -EINVAL; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return -ENOMEM; ret = ib_query_port(device, port, tprops); if (ret) { dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, tprops); if (ret) goto err; } update_pkeys &= !!tprops->pkey_tbl_len; if (update_pkeys) { pkey_cache = kmalloc(struct_size(pkey_cache, table, tprops->pkey_tbl_len), GFP_KERNEL); if (!pkey_cache) { ret = -ENOMEM; goto err; } pkey_cache->table_len = tprops->pkey_tbl_len; for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { dev_warn(&device->dev, "ib_query_pkey failed (%d) for index %d\n", ret, i); goto err; } } } write_lock_irq(&device->cache_lock); if (update_pkeys) { old_pkey_cache = device->port_data[port].cache.pkey; device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache_lock); if (enforce_security) ib_security_cache_change(device, port, tprops->subnet_prefix); kfree(old_pkey_cache); kfree(tprops); return 0; err: kfree(pkey_cache); kfree(tprops); return ret; } static void ib_cache_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); int ret; /* Before distributing the cache update event, first sync * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, work->event.event == IB_EVENT_GID_CHANGE, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); /* GID event is notified already for individual GID entries by * dispatch_gid_change_event(). Hence, notify for the rest of the * events. 
*/ if (!ret && work->event.event != IB_EVENT_GID_CHANGE) ib_dispatch_event_clients(&work->event); kfree(work); } static void ib_generic_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); ib_dispatch_event_clients(&work->event); kfree(work); } static bool is_cache_update_event(const struct ib_event *event) { return (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE); } /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(const struct ib_event *event) { struct ib_update_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; if (is_cache_update_event(event)) INIT_WORK(&work->work, ib_cache_event_task); else INIT_WORK(&work->work, ib_generic_event_task); work->event = *event; if (event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_GID_CHANGE) work->enforce_security = true; queue_work(ib_wq, &work->work); } EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { u32 p; int err; err = gid_table_setup_one(device); if (err) return err; rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); if (err) return err; } return 0; } void ib_cache_release_one(struct ib_device *device) { u32 p; /* * The release function frees all the cache elements. * This function should be called as part of freeing * all the device's resources when the cache could no * longer be accessed. */ rdma_for_each_port (device, p) kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); } void ib_cache_cleanup_one(struct ib_device *device) { /* The cleanup function waits for all in-progress workqueue * elements and cleans up the GID cache. This function should be * called after the device was removed from the devices list and * all clients were removed, so the cache exists but is * non-functional and shouldn't be updated anymore. */ flush_workqueue(ib_wq); gid_table_cleanup_one(device); /* * Flush the wq second time for any pending GID delete work. */ flush_workqueue(ib_wq); }
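As a usage illustration for the cache API implemented above: the sketch below shows how a consumer might pair rdma_get_gid_attr() with rdma_put_gid_attr() and read the L2 fields of a RoCE GID entry. The function name demo_dump_gid and the surrounding driver context are hypothetical; only the rdma_* calls and their signatures come from the code above, and error handling is trimmed to the minimum.

#include <linux/err.h>
#include <linux/if_ether.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

/* demo_dump_gid() is a hypothetical consumer, shown only for illustration. */
static int demo_dump_gid(struct ib_device *device, u32 port_num, int index)
{
	const struct ib_gid_attr *attr;
	u16 vlan_id;
	u8 smac[ETH_ALEN];
	int ret;

	/* Takes a reference on the cached entry; must be balanced with
	 * rdma_put_gid_attr() regardless of the link layer.
	 */
	attr = rdma_get_gid_attr(device, port_num, index);
	if (IS_ERR(attr))
		return PTR_ERR(attr);

	/* Only meaningful when the entry carries a netdev (RoCE);
	 * returns -ENODEV otherwise.
	 */
	ret = rdma_read_gid_l2_fields(attr, &vlan_id, smac);
	if (!ret)
		pr_info("gid %pI6 vlan %u smac %pM\n",
			attr->gid.raw, vlan_id, smac);

	rdma_put_gid_attr(attr);
	return ret;
}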
// SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after seeing * an unknown discard-with-error flavour TLV option whether it's an * ICMP error message or not (errors should never be sent in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdrp is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. 
* - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. 
*/ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr);
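/*
 * A stand-alone sketch for experimenting with the same walk outside the
 * kernel: it mirrors the loop in ipv6_skip_exthdr() over a flat packet
 * buffer, with plain bounds checks standing in for skb_header_pointer() and
 * the AUTH/option length rules written out explicitly.  The NEXTHDR_* values
 * match the constants used above; everything else is invented for the
 * example and deliberately simplified (no frag_off reporting).
 */
#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define NEXTHDR_HOP       0
#define NEXTHDR_ROUTING  43
#define NEXTHDR_FRAGMENT 44
#define NEXTHDR_AUTH     51
#define NEXTHDR_NONE     59
#define NEXTHDR_DEST     60

struct opt_hdr { uint8_t nexthdr; uint8_t hdrlen; };

static int is_ext_hdr(uint8_t nh)
{
	return nh == NEXTHDR_HOP || nh == NEXTHDR_ROUTING ||
	       nh == NEXTHDR_FRAGMENT || nh == NEXTHDR_AUTH ||
	       nh == NEXTHDR_NONE || nh == NEXTHDR_DEST;
}

/* Returns the offset of the first non-extension header, or -1 on truncation
 * or NEXTHDR_NONE.  Stops at a non-first fragment, like the kernel helper. */
int skip_exthdr(const uint8_t *pkt, size_t len, int start, uint8_t *nexthdrp)
{
	uint8_t nexthdr = *nexthdrp;

	while (is_ext_hdr(nexthdr)) {
		struct opt_hdr hp;
		int hdrlen;

		if (nexthdr == NEXTHDR_NONE || start + sizeof(hp) > len)
			return -1;
		memcpy(&hp, pkt + start, sizeof(hp));

		if (nexthdr == NEXTHDR_FRAGMENT) {
			uint16_t frag_off;

			if (start + 4 > len)
				return -1;
			memcpy(&frag_off, pkt + start + 2, sizeof(frag_off));
			if (ntohs(frag_off) & ~0x7)
				break;		/* non-first fragment: stop here */
			hdrlen = 8;
		} else if (nexthdr == NEXTHDR_AUTH) {
			hdrlen = (hp.hdrlen + 2) << 2;	/* AH length in 4-octet units */
		} else {
			hdrlen = (hp.hdrlen + 1) << 3;	/* other ext hdrs: 8-octet units */
		}
		nexthdr = hp.nexthdr;
		start += hdrlen;
	}
	*nexthdrp = nexthdr;
	return start;
}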
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. 
hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { switch (whence) { case SEEK_END: offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (offset == 0) return file->f_pos; /* * f_lock protects against read/modify/write race with other * SEEK_CURs. Note that parallel writes and reads behave * like SEEK_SET. */ spin_lock(&file->f_lock); offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. 
*/ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. 
*/ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. 
*/ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. 
*/ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, 
struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. 
*/ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = 
vfs_writev(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). 
*/ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!in.file) goto out; if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = in.file->f_pos; } else { pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!out.file) goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. 
*/ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(out.file, true); if (!opipe) { retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); out.file->f_pos = out_pos; if (ppos) *ppos = pos; else in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. 
*/ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. 
*/ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!f_in.file) goto out2; f_out = fdget(fd_out); if (!f_out.file) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = f_in.file->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = f_out.file->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { f_in.file->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { f_out.file->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... 
*/ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; }
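/*
 * Seen from user space, the fallback logic in vfs_copy_file_range() mostly
 * shows up as "copy_file_range(2) may copy fewer bytes than requested, or
 * fail for a given kernel/filesystem pair".  Below is a minimal demo under
 * the assumption of a glibc new enough (>= 2.27) to provide the wrapper: it
 * loops over copy_file_range() with NULL offsets so the kernel uses and
 * advances each file's f_pos, then falls back to a plain read/write loop if
 * the copy stops short.  Because both f_pos values were already advanced,
 * the fallback simply continues where the fast path left off.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Plain read/write fallback for kernels or fs pairs without copy_file_range. */
static int copy_rw(int in, int out)
{
	char buf[64 * 1024];
	ssize_t n;

	while ((n = read(in, buf, sizeof(buf))) > 0)
		if (write(out, buf, n) != n)
			return -1;
	return n < 0 ? -1 : 0;
}

int main(int argc, char **argv)
{
	struct stat st;
	ssize_t n = 0;
	off_t left;
	int in, out;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 1;
	}
	in = open(argv[1], O_RDONLY);
	out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in < 0 || out < 0 || fstat(in, &st) < 0) {
		perror("open/fstat");
		return 1;
	}

	for (left = st.st_size; left > 0; left -= n) {
		/* NULL offsets: the kernel uses and updates each file's f_pos. */
		n = copy_file_range(in, NULL, out, NULL, left, 0);
		if (n <= 0)
			break;
	}
	if (left > 0 && copy_rw(in, out) < 0) {
		perror("copy");
		return 1;
	}
	return 0;
}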
/* SPDX-License-Identifier: GPL-2.0-only */ /* * kernfs.h - pseudo filesystem decoupled from vfs locking */ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H #include <linux/err.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/types.h> #include <linux/uidgid.h> #include <linux/wait.h> #include <linux/rwsem.h> #include <linux/cache.h> struct file; struct dentry; struct iattr; struct seq_file; struct vm_area_struct; struct vm_operations_struct; struct super_block; struct file_system_type; struct poll_table_struct; struct fs_context; struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; /* * NR_KERNFS_LOCK_BITS determines the size (NR_KERNFS_LOCKS) of the hash * table of locks. * Having a small hash table would impact scalability, since * more and more kernfs_node objects will end up using the same lock, * while having a very large hash table would waste memory. 
* * At the moment size of hash table of locks is being set based on * the number of CPUs as follows: * * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS * 1 1 2 * 2-3 2 4 * 4-7 4 16 * 8-15 6 64 * 16-31 8 256 * 32 and more 10 1024 * * The above relation between NR_CPU and number of locks is based * on some internal experimentation which involved booting qemu * with different values of smp, performing some sysfs operations * on all CPUs and observing how increase in number of locks impacts * completion time of these sysfs operations on each CPU. */ #ifdef CONFIG_SMP #define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32))) #else #define NR_KERNFS_LOCK_BITS 1 #endif #define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS) /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is * protected by kernfs_global_locks.open_file_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { struct mutex open_file_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK #define KERNFS_MAX_USER_XATTRS 128 #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. 
This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; /* * Monotonic revision counter, used to identify if a directory * node has changed during negative dentry revalidation. */ unsigned long rev; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node __rcu *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node *parent; const char *name; struct rb_node rb; const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ unsigned short flags; umode_t mode; union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; void *priv; struct kernfs_iattrs *iattr; struct rcu_head rcu; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. 
*/ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence); }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ void *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. 
*/ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const void *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct 
kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */
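A hedged sketch of how the interfaces declared above fit together: create a root with its nodes deactivated, add a directory plus one read-only attribute backed by a seq_show op, then activate the subtree so it becomes visible atomically. The demo_* names, the "hello" payload and the error handling are hypothetical; a real user would also wire the root into a file_system_type and mount it through kernfs_get_tree(), which is omitted here, and would register demo_kernfs_init() as an initcall.

#include <linux/init.h>
#include <linux/kernfs.h>
#include <linux/seq_file.h>

/* Hypothetical read-only attribute: print the node's priv string. */
static int demo_seq_show(struct seq_file *sf, void *v)
{
	/* As documented above, sf->private points to the kernfs_open_file. */
	struct kernfs_open_file *of = sf->private;

	seq_printf(sf, "%s\n", (const char *)of->kn->priv);
	return 0;
}

static const struct kernfs_ops demo_ops = {
	.seq_show = demo_seq_show,
};

static int __init demo_kernfs_init(void)
{
	struct kernfs_root *root;
	struct kernfs_node *dir, *kn;

	/* Create the hierarchy deactivated so it can be published atomically. */
	root = kernfs_create_root(NULL, KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(root))
		return PTR_ERR(root);

	dir = kernfs_create_dir(kernfs_root_to_node(root), "demo", 0755, NULL);
	if (IS_ERR(dir)) {
		kernfs_destroy_root(root);
		return PTR_ERR(dir);
	}

	kn = __kernfs_create_file(dir, "value", 0444, GLOBAL_ROOT_UID,
				  GLOBAL_ROOT_GID, 0, &demo_ops,
				  (void *)"hello", NULL, NULL);
	if (IS_ERR(kn)) {
		kernfs_destroy_root(root);
		return PTR_ERR(kn);
	}

	/* Make the whole subtree visible in one go. */
	kernfs_activate(kernfs_root_to_node(root));
	return 0;
}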
// SPDX-License-Identifier: GPL-2.0-or-later /* System trusted keyring for trusted public keys * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/uidgid.h> #include <linux/verification.h> #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include <crypto/pkcs7.h> static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING static struct key *secondary_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING static struct key *machine_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING static struct key *platform_trusted_keys; #endif extern __initconst const u8 system_certificate_list[]; extern __initconst const unsigned long system_certificate_list_size; extern __initconst const unsigned long module_cert_size; /** * restrict_link_by_builtin_trusted - Restrict keyring addition by built-in CA * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. */ int restrict_link_by_builtin_trusted(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_signature(dest_keyring, type, payload, builtin_trusted_keys); } /** * restrict_link_by_digsig_builtin - Restrict digitalSignature key additions by the built-in keyring * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. 
* @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. The new key * must have the digitalSignature usage field set. */ int restrict_link_by_digsig_builtin(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_digsig(dest_keyring, type, payload, builtin_trusted_keys); } #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING /** * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring * addition by both built-in and secondary keyrings. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. */ int restrict_link_by_builtin_and_secondary_trusted( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_signature(dest_keyring, type, payload, secondary_trusted_keys); } /** * restrict_link_by_digsig_builtin_and_secondary - Restrict by digitalSignature. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. The new key must have the digitalSignature usage field set. */ int restrict_link_by_digsig_builtin_and_secondary(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_digsig(dest_keyring, type, payload, secondary_trusted_keys); } /* * Allocate a struct key_restriction for the "builtin and secondary trust" * keyring. Only for use in system_trusted_keyring_init(). */ static __init struct key_restriction *get_builtin_and_secondary_restriction(void) { struct key_restriction *restriction; restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!restriction) panic("Can't allocate secondary trusted keyring restriction\n"); if (IS_ENABLED(CONFIG_INTEGRITY_MACHINE_KEYRING)) restriction->check = restrict_link_by_builtin_secondary_and_machine; else restriction->check = restrict_link_by_builtin_and_secondary_trusted; return restriction; } /** * add_to_secondary_keyring - Add to secondary keyring. 
* @source: Source of key * @data: The blob holding the key * @len: The length of the data blob * * Add a key to the secondary keyring. The key must be vouched for by a key in the builtin, * machine or secondary keyring itself. */ void __init add_to_secondary_keyring(const char *source, const void *data, size_t len) { key_ref_t key; key_perm_t perm; perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW; key = key_create_or_update(make_key_ref(secondary_trusted_keys, 1), "asymmetric", NULL, data, len, perm, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { pr_err("Problem loading X.509 certificate from %s to secondary keyring %ld\n", source, PTR_ERR(key)); return; } pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); } #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING void __init set_machine_trusted_keys(struct key *keyring) { machine_trusted_keys = keyring; if (key_link(secondary_trusted_keys, machine_trusted_keys) < 0) panic("Can't link (machine) trusted keyrings\n"); } /** * restrict_link_by_builtin_secondary_and_machine - Restrict keyring addition. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in, the secondary, or * the machine keyrings. */ int restrict_link_by_builtin_secondary_and_machine( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { if (machine_trusted_keys && type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &machine_trusted_keys->payload) /* Allow the machine keyring to be added to the secondary */ return 0; return restrict_link_by_builtin_and_secondary_trusted(dest_keyring, type, payload, restrict_key); } #endif /* * Create the trusted keyrings */ static __init int system_trusted_keyring_init(void) { pr_notice("Initialise system trusted keyrings\n"); builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); if (key_link(secondary_trusted_keys, builtin_trusted_keys) < 0) panic("Can't link trusted keyrings\n"); #endif return 0; } /* * Must be initialised before we try and load the keys into the keyring. */ device_initcall(system_trusted_keyring_init); __init int load_module_cert(struct key *keyring) { if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG)) return 0; pr_notice("Loading compiled-in module X.509 certificates\n"); return x509_load_certificate_list(system_certificate_list, module_cert_size, keyring); } /* * Load the compiled-in list of X.509 certificates. 
*/ static __init int load_system_certificate_list(void) { const u8 *p; unsigned long size; pr_notice("Loading compiled-in X.509 certificates\n"); #ifdef CONFIG_MODULE_SIG p = system_certificate_list; size = system_certificate_list_size; #else p = system_certificate_list + module_cert_size; size = system_certificate_list_size - module_cert_size; #endif return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_message_sig(const void *data, size_t len, struct pkcs7_message *pkcs7, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { int ret; /* The data should be detached - so we need to supply it. */ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; } ret = pkcs7_verify(pkcs7, usage); if (ret < 0) goto error; ret = is_key_on_revocation_list(pkcs7); if (ret != -ENOKEY) { pr_devel("PKCS#7 key is on revocation list\n"); goto error; } if (!trusted_keys) { trusted_keys = builtin_trusted_keys; } else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) { #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING trusted_keys = secondary_trusted_keys; #else trusted_keys = builtin_trusted_keys; #endif } else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) { #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING trusted_keys = platform_trusted_keys; #else trusted_keys = NULL; #endif if (!trusted_keys) { ret = -ENOKEY; pr_devel("PKCS#7 platform keyring is not available\n"); goto error; } } ret = pkcs7_validate_trust(pkcs7, trusted_keys); if (ret < 0) { if (ret == -ENOKEY) pr_devel("PKCS#7 signature not signed with a trusted key\n"); goto error; } if (view_content) { size_t asn1hdrlen; ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); if (ret < 0) { if (ret == -ENODATA) pr_devel("PKCS#7 message does not contain data\n"); goto error; } ret = view_content(ctx, data, len, asn1hdrlen); } error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /** * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. 
*/ int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { struct pkcs7_message *pkcs7; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); if (IS_ERR(pkcs7)) return PTR_ERR(pkcs7); ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage, view_content, ctx); pkcs7_free_message(pkcs7); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL_GPL(verify_pkcs7_signature); #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING void __init set_platform_trusted_keys(struct key *keyring) { platform_trusted_keys = keyring; } #endif
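For context, here is a hedged sketch of a caller of verify_pkcs7_signature() as defined above, checking a detached PKCS#7 signature over a blob against the secondary trusted keyring (verify_pkcs7_message_sig() falls back to the builtin keyring when CONFIG_SECONDARY_TRUSTED_KEYRING is off). The demo_verify_blob() name is hypothetical; VERIFY_USE_SECONDARY_KEYRING and the key_being_used_for values come from <linux/verification.h>.

#include <linux/verification.h>

/*
 * Hypothetical caller: verify a detached PKCS#7 signature over @blob.
 * Returns 0 if the signature chains up to a trusted key, -errno otherwise.
 */
static int demo_verify_blob(const void *blob, size_t blob_len,
			    const void *pkcs7, size_t pkcs7_len)
{
	return verify_pkcs7_signature(blob, blob_len, pkcs7, pkcs7_len,
				      VERIFY_USE_SECONDARY_KEYRING,
				      VERIFYING_UNSPECIFIED_SIGNATURE,
				      NULL, NULL);
}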
/* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. 
*/ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> #include <linux/workqueue_types.h> /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ WORK_STRUCT_PWQ_BIT, /* data points to pwq */ WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif WORK_STRUCT_FLAG_BITS, /* color for workqueue flushing */ WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ * debugobjects turned off. This makes pwqs aligned to 256 bytes (512 * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. * * MSB * [ pwq pointer ] [ flush color ] [ STRUCT flags ] * 4 bits 4 or 5 bits */ WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_DISABLE_BITS = 16, /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; enum work_flags { WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif }; enum wq_misc_consts { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! 
*/ #define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod poer SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs * * Work items in this workqueue are affine to these CPUs and not allowed * to execute on other CPUs. A pool serving a workqueue must have the * same @cpumask. */ cpumask_var_t cpumask; /** * @__pod_cpumask: internal attribute used to create per-pod pools * * Internal use only. * * Per-pod unbound worker pools are used to improve locality. Always a * subset of ->cpumask. A workqueue can be associated with multiple * worker pools with disjoint @__pod_cpumask's. Whether the enforcement * of a pool's @__pod_cpumask is strict depends on @affn_strict. */ cpumask_var_t __pod_cpumask; /** * @affn_strict: affinity scope is strict * * If clear, workqueue will make a best-effort attempt at starting the * worker inside @__pod_cpumask but the scheduler is free to migrate it * outside. * * If set, workers are only allowed to run inside @__pod_cpumask. */ bool affn_strict; /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. * * If @affn_strict is set, @cpumask isn't a property of a worker_pool * either. */ /** * @affn_scope: unbound CPU affinity scope * * CPU pods are used to improve execution locality of unbound work * items. There are multiple pod types, one for each wq_affn_scope, and * every CPU in the system belongs to one pod in every pod type. CPUs * that belong to the same pod share the worker pool. For example, * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker * pool for each NUMA node. 
*/ enum wq_affn_scope affn_scope; /** * @ordered: work items must be executed one by one in queueing order */ bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. 
*/ #ifdef CONFIG_LOCKDEP #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define __INIT_WORK(_work, _func, _onstack) \ do { \ static __maybe_unused struct lock_class_key __key; \ \ __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ } while (0) #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ __INIT_WORK_KEY((_work), (_func), 1, _key) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum wq_flags { WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. 
* * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, }; enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, /* * Per-node default cap on min_active. Unless explicitly set, min_active * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see * workqueue_struct->min_active definition. */ WQ_DFL_MIN_ACTIVE = 8, }; /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items * are executed in the queueing CPU's BH context in the queueing order. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @...: args for @fmt * * For a per-cpu workqueue, @max_active limits the number of in-flight work * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be * executing at most one work item for the workqueue. * * For unbound workqueues, @max_active limits the number of in-flight work items * for the whole system. e.g. 
@max_active of 16 indicates that there can be * at most 16 work items executing for the workqueue in the whole system. * * As sharing the same active counter for an unbound workqueue across multiple * NUMA nodes can be expensive, @max_active is distributed to each NUMA node * according to the proportion of the number of online CPUs and enforced * independently. * * Depending on online CPU distribution, a node may end up with per-node * max_active which is significantly lower than @max_active, which can lead to * deadlocks if the per-node concurrency limit is lower than the maximum number * of interdependent work items for the workqueue. * * To guarantee forward progress regardless of online CPU distribution, the * concurrency limit on every node is guaranteed to be equal to or greater than * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) 
\ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) #define from_work(var, callback_work, work_fieldname) \ container_of(callback_work, typeof(*var), work_fieldname) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool disable_work(struct work_struct *work); extern bool disable_work_sync(struct work_struct *work); extern bool enable_work(struct work_struct *work); extern bool disable_delayed_work(struct delayed_work *dwork); extern bool disable_delayed_work_sync(struct delayed_work *dwork); extern bool enable_delayed_work(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern void workqueue_set_min_active(struct workqueue_struct *wq, int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_all_workqueues(void); extern void show_freezable_workqueues(void); extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. 
* * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * enable_and_queue_work - Enable and queue a work item on a specific workqueue * @wq: The target workqueue * @work: The work item to be enabled and queued * * This function combines the operations of enable_work() and queue_work(), * providing a convenient way to enable and queue a work item in a single call. * It invokes enable_work() on @work and then queues it if the disable depth * reached 0. Returns %true if the disable depth reached 0 and @work is queued, * and %false otherwise. * * Note that @work is always queued when disable depth reaches zero. If the * desired behavior is queueing only if certain events took place while @work is * disabled, the user should implement the necessary state tracking and perform * explicit conditional queueing after enable_work(). */ static inline bool enable_and_queue_work(struct workqueue_struct *wq, struct work_struct *work) { if (enable_work(work)) { queue_work(wq, work); return true; } return false; } /* * Detect attempt to flush system-wide workqueues at compile time when possible. * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. 
*/ extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); /* Please stop using this function, for this function will be removed in near future. */ #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ if ((__builtin_constant_p(_wq == system_wq) && \ _wq == system_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ (__builtin_constant_p(_wq == system_unbound_wq) && \ _wq == system_unbound_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ _wq == system_power_efficient_wq) || \ (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ _wq == system_freezable_power_efficient_wq)) \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(_wq); \ }) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_key(_cpu, _fn, _arg, &__key); \ }) long work_on_cpu_safe_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. 
*/ #define work_on_cpu_safe(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ }) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); void __init workqueue_init_topology(void); #endif
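The header above only declares the API; the sketch below is not part of the kernel sources (all "foo_*" names are hypothetical) and shows the usual lifecycle a driver follows with it: allocate a workqueue, embed a work_struct in driver state, recover the container with from_work() in the handler, and tear everything down with cancel_work_sync() and destroy_workqueue().

#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

struct foo_device {
	struct workqueue_struct *wq;
	struct work_struct work;
	int pending_events;
};

static void foo_handler(struct work_struct *work)
{
	/* from_work() is the container_of() helper defined above. */
	struct foo_device *foo = from_work(foo, work, work);

	pr_info("foo: processing %d pending events\n", foo->pending_events);
}

static int foo_init(struct foo_device *foo)
{
	/* One concurrently active work item, no CPU locality requirement. */
	foo->wq = alloc_workqueue("foo", WQ_UNBOUND, 1);
	if (!foo->wq)
		return -ENOMEM;

	INIT_WORK(&foo->work, foo_handler);
	return 0;
}

static void foo_notify(struct foo_device *foo)
{
	/* Returns false when the work item is already queued. */
	queue_work(foo->wq, &foo->work);
}

static void foo_exit(struct foo_device *foo)
{
	cancel_work_sync(&foo->work);
	destroy_workqueue(foo->wq);
}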
// SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <net/checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, [UDP_CT_REPLIED] = 120*HZ, }; static unsigned int *udp_get_timeouts(struct net *net) { return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udp_error_log(skb, state, "short packet"); return true; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { udp_error_log(skb, state, "truncated/malformed packet"); return true; } /* Packet with no checksum */ if (!hdr->check) return false; /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct.
* FIXME: Source route IP option packets --RR */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { udp_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); status = READ_ONCE(ct->status); if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #ifdef CONFIG_NF_CT_PROTO_UDPLITE static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; unsigned int cscov; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udplite_error_log(skb, state, "short packet"); return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { udplite_error_log(skb, state, "invalid checksum coverage"); return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { udplite_error_log(skb, state, "checksum missing"); return true; } /* Checksum invalid? Ignore. */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, state->pf)) { udplite_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. 
*/ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); if (unlikely((ct->status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; /* set default timeouts for UDP. */ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { timeouts[UDP_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_UDP_REPLIED]) { timeouts[UDP_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; } return 0; } static int udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, htonl(timeouts[UDP_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_udp_init_net(struct net *net) { struct nf_udp_net *un = nf_udp_pernet(net); int i; for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) un->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { .l4proto = IPPROTO_UDP, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #ifdef CONFIG_NF_CT_PROTO_UDPLITE const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #endif
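As a reading aid for the two packet handlers above, here is a stripped-down restatement (illustrative only; plain C types rather than kernel ones, and the default timeouts from udp_timeouts[] expressed in seconds) of how the UDP tracker picks a timeout: unreplied flows get the short timeout, flows still exchanging packets two seconds after confirmation get the stream timeout and, unless they are NAT-clash entries, are promoted to ASSURED.

#include <stdbool.h>

enum { UDP_TIMEOUT_UNREPLIED = 30, UDP_TIMEOUT_REPLIED = 120 };

struct flow_state {
	bool seen_reply;    /* traffic observed in both directions */
	bool assured;       /* ASSURED already set */
	bool nat_clash;     /* NAT clash entries are never promoted */
	long confirmed_at;  /* seconds when the flow was confirmed */
};

static int udp_pick_timeout(const struct flow_state *f, long now,
			    bool *set_assured)
{
	*set_assured = false;

	if (!f->seen_reply)
		return UDP_TIMEOUT_UNREPLIED;

	/* Still active two seconds after confirmation? Treat as a stream. */
	if (now - f->confirmed_at > 2) {
		if (!f->nat_clash && !f->assured)
			*set_assured = true;
		return UDP_TIMEOUT_REPLIED;
	}

	return UDP_TIMEOUT_UNREPLIED;
}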
// SPDX-License-Identifier: GPL-2.0 /* * property.c - Unified device property interface. * * Copyright (C) 2014, Intel Corporation * Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kconfig.h> #include <linux/of.h> #include <linux/property.h> #include <linux/phy.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> struct fwnode_handle *__dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode); const struct fwnode_handle *__dev_fwnode_const(const struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? of_fwnode_handle(dev->of_node) : dev->fwnode; } EXPORT_SYMBOL_GPL(__dev_fwnode_const); /** * device_property_present - check if a property of a device is present * @dev: Device whose property is being checked * @propname: Name of the property * * Check if property @propname is present in the device firmware description.
* * Return: true if property @propname is present. Otherwise, returns false. */ bool device_property_present(const struct device *dev, const char *propname) { return fwnode_property_present(dev_fwnode(dev), propname); } EXPORT_SYMBOL_GPL(device_property_present); /** * fwnode_property_present - check if a property of a firmware node is present * @fwnode: Firmware node whose property to check * @propname: Name of the property * * Return: true if property @propname is present. Otherwise, returns false. */ bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { bool ret; if (IS_ERR_OR_NULL(fwnode)) return false; ret = fwnode_call_bool_op(fwnode, property_present, propname); if (ret) return ret; return fwnode_call_bool_op(fwnode->secondary, property_present, propname); } EXPORT_SYMBOL_GPL(fwnode_property_present); /** * device_property_read_u8_array - return a u8 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u8 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u8_array); /** * device_property_read_u16_array - return a u16 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u16 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u16_array(const struct device *dev, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u16_array); /** * device_property_read_u32_array - return a u32 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u32 properties with @propname from the device * firmware description and stores them to @val if found. 
* * It's recommended to call device_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u32_array(const struct device *dev, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u32_array); /** * device_property_read_u64_array - return a u64 array property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of u64 properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_u64_array(const struct device *dev, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_u64_array); /** * device_property_read_string_array - return a string array property of device * @dev: Device to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Function reads an array of string properties with @propname from the device * firmware description and stores them to @val if found. * * It's recommended to call device_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string_array(const struct device *dev, const char *propname, const char **val, size_t nval) { return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval); } EXPORT_SYMBOL_GPL(device_property_read_string_array); /** * device_property_read_string - return a string property of a device * @dev: Device to get the property of * @propname: Name of the property * @val: The value is stored here * * Function reads property @propname from the device firmware description and * stores the value into @val if found. The value is checked to be a string. 
* * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property type is not a string. * %-ENXIO if no suitable firmware interface is present. */ int device_property_read_string(const struct device *dev, const char *propname, const char **val) { return fwnode_property_read_string(dev_fwnode(dev), propname, val); } EXPORT_SYMBOL_GPL(device_property_read_string); /** * device_property_match_string - find a string in an array and return index * @dev: Device to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int device_property_match_string(const struct device *dev, const char *propname, const char *string) { return fwnode_property_match_string(dev_fwnode(dev), propname, string); } EXPORT_SYMBOL_GPL(device_property_match_string); static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_int_array, propname, elem_size, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_int_array, propname, elem_size, val, nval); } /** * fwnode_property_read_u8_array - return a u8 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u8 properties with @propname from @fwnode and stores them to * @val if found. * * It's recommended to call fwnode_property_count_u8() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); /** * fwnode_property_read_u16_array - return a u16 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u16 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u16() instead of calling * this function with @val equals %NULL and @nval equals 0. 
* * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); /** * fwnode_property_read_u32_array - return a u32 array property of firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u32 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u32() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); /** * fwnode_property_read_u64_array - return a u64 array property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read an array of u64 properties with @propname from @fwnode and store them to * @val if found. * * It's recommended to call fwnode_property_count_u64() instead of calling * this function with @val equals %NULL and @nval equals 0. * * Return: number of values if @val was %NULL, * %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of numbers, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); /** * fwnode_property_read_string_array - return string array property of a node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The values are stored here or %NULL to return the number of values * @nval: Size of the @val array * * Read a string list property @propname from the given firmware node and store * them to @val if found. * * It's recommended to call fwnode_property_string_array_count() instead of calling * this function with @val equals %NULL and @nval equals 0.
* * Return: number of values read on success if @val is non-NULL, * number of values available on success if @val is NULL, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not an array of strings, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -EINVAL; ret = fwnode_call_int_op(fwnode, property_read_string_array, propname, val, nval); if (ret != -EINVAL) return ret; return fwnode_call_int_op(fwnode->secondary, property_read_string_array, propname, val, nval); } EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); /** * fwnode_property_read_string - return a string property of a firmware node * @fwnode: Firmware node to get the property of * @propname: Name of the property * @val: The value is stored here * * Read property @propname from the given firmware node and store the value into * @val if found. The value is checked to be a string. * * Return: %0 if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(fwnode_property_read_string); /** * fwnode_property_match_string - find a string in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the array * @string: String to look for * * Find a given string in a string array and if it is found return the * index back. * * Return: index, starting from %0, if the property was found (success), * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; int nval, ret; nval = fwnode_property_string_array_count(fwnode, propname); if (nval < 0) return nval; if (nval == 0) return -ENODATA; values = kcalloc(nval, sizeof(*values), GFP_KERNEL); if (!values) return -ENOMEM; ret = fwnode_property_read_string_array(fwnode, propname, values, nval); if (ret < 0) goto out_free; ret = match_string(values, nval, string); if (ret < 0) ret = -ENODATA; out_free: kfree(values); return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_string); /** * fwnode_property_match_property_string - find a property string value in an array and return index * @fwnode: Firmware node to get the property of * @propname: Name of the property holding the string value * @array: String array to search in * @n: Size of the @array * * Find a property string value in a given @array and if it is found return * the index back. 
* * Return: index, starting from %0, if the string value was found in the @array (success), * %-ENOENT when the string value was not found in the @array, * %-EINVAL if given arguments are not valid, * %-ENODATA if the property does not have a value, * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, const char *propname, const char * const *array, size_t n) { const char *string; int ret; ret = fwnode_property_read_string(fwnode, propname, &string); if (ret) return ret; ret = match_string(array, n, string); if (ret < 0) ret = -ENOENT; return ret; } EXPORT_SYMBOL_GPL(fwnode_property_match_property_string); /** * fwnode_property_get_reference_args() - Find a reference with arguments * @fwnode: Firmware node where to look for the reference * @prop: The name of the property * @nargs_prop: The name of the property telling the number of * arguments in the referred node. NULL if @nargs is known, * otherwise @nargs is ignored. Only relevant on OF. * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. * @index: Index of the reference, from zero onwards. * @args: Result structure with reference and integer arguments. * May be NULL. * * Obtain a reference based on a named property in an fwnode, with * integer arguments. * * The caller is responsible for calling fwnode_handle_put() on the returned * @args->fwnode pointer. * * Return: %0 on success * %-ENOENT when the index is out of bounds, the index has an empty * reference or the property was not found * %-EINVAL on parse error */ int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { int ret; if (IS_ERR_OR_NULL(fwnode)) return -ENOENT; ret = fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, nargs, index, args); if (ret == 0) return ret; if (IS_ERR_OR_NULL(fwnode->secondary)) return ret; return fwnode_call_int_op(fwnode->secondary, get_reference_args, prop, nargs_prop, nargs, index, args); } EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); /** * fwnode_find_reference - Find named reference to a fwnode_handle * @fwnode: Firmware node where to look for the reference * @name: The name of the reference * @index: Index of the reference * * @index can be used when the named reference holds a table of references. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: a pointer to the reference fwnode, when found. Otherwise, * returns an error pointer. */ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *name, unsigned int index) { struct fwnode_reference_args args; int ret; ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index, &args); return ret ? ERR_PTR(ret) : args.fwnode; } EXPORT_SYMBOL_GPL(fwnode_find_reference); /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node * * Return: a pointer to the node name, or %NULL. */ const char *fwnode_get_name(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name); } EXPORT_SYMBOL_GPL(fwnode_get_name); /** * fwnode_get_name_prefix - Return the prefix of node for printing purposes * @fwnode: The firmware node * * Return: the prefix of a node, intended to be printed right before the node. * The prefix works also as a separator between the nodes. 
*/ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_name_prefix); } /** * fwnode_name_eq - Return true if node name is equal * @fwnode: The firmware node * @name: The name to which to compare the node name * * Compare the name provided as an argument to the name of the node, stopping * the comparison at either NUL or '@' character, whichever comes first. This * function is generally used for comparing node names while ignoring the * possible unit address of the node. * * Return: true if the node name matches with the name provided in the @name * argument, false otherwise. */ bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name) { const char *node_name; ptrdiff_t len; node_name = fwnode_get_name(fwnode); if (!node_name) return false; len = strchrnul(node_name, '@') - node_name; return str_has_prefix(node_name, name) == len; } EXPORT_SYMBOL_GPL(fwnode_name_eq); /** * fwnode_get_parent - Return parent firmware node * @fwnode: Firmware node whose parent is retrieved * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } EXPORT_SYMBOL_GPL(fwnode_get_parent); /** * fwnode_get_next_parent - Iterate to the node's parent * @fwnode: Firmware node whose parent is retrieved * * This is like fwnode_get_parent() except that it drops the refcount * on the passed node, making it suitable for iterating through a * node's parents. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @fwnode * unconditionally. * * Return: parent firmware node of the given node if possible or %NULL if no * parent was available. */ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) { struct fwnode_handle *parent = fwnode_get_parent(fwnode); fwnode_handle_put(fwnode); return parent; } EXPORT_SYMBOL_GPL(fwnode_get_next_parent); /** * fwnode_count_parents - Return the number of parents a node has * @fwnode: The node the parents of which are to be counted * * Return: the number of parents a node has. */ unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; unsigned int count = 0; fwnode_for_each_parent_node(fwnode, parent) count++; return count; } EXPORT_SYMBOL_GPL(fwnode_count_parents); /** * fwnode_get_nth_parent - Return an nth parent of a node * @fwnode: The node the parent of which is requested * @depth: Distance of the parent from the node * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the nth parent of a node. If there is no parent at the requested * @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to * fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on. */ struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode, unsigned int depth) { struct fwnode_handle *parent; if (depth == 0) return fwnode_handle_get(fwnode); fwnode_for_each_parent_node(fwnode, parent) { if (--depth == 0) return parent; } return NULL; } EXPORT_SYMBOL_GPL(fwnode_get_nth_parent); /** * fwnode_get_next_child_node - Return the next child node handle for a node * @fwnode: Firmware node to find the next child node for.
* @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return fwnode_call_ptr_op(fwnode, get_next_child_node, child); } EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); /** * fwnode_get_next_available_child_node - Return the next available child node handle for a node * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle * fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct fwnode_handle *next_child = child; if (IS_ERR_OR_NULL(fwnode)) return NULL; do { next_child = fwnode_get_next_child_node(fwnode, next_child); if (!next_child) return NULL; } while (!fwnode_device_is_available(next_child)); return next_child; } EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node); /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for. * @child: Handle to one of the device's child nodes or a %NULL handle. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @child * unconditionally. */ struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child) { const struct fwnode_handle *fwnode = dev_fwnode(dev); struct fwnode_handle *next; if (IS_ERR_OR_NULL(fwnode)) return NULL; /* Try to find a child in primary fwnode */ next = fwnode_get_next_child_node(fwnode, child); if (next) return next; /* When no more children in primary, continue with secondary */ return fwnode_get_next_child_node(fwnode->secondary, child); } EXPORT_SYMBOL_GPL(device_get_next_child_node); /** * fwnode_get_named_child_node - Return first matching named child node handle * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); /** * device_get_named_child_node - Return first matching named child node handle * @dev: Device to find the named child node for. * @childname: String to match child node name against. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle *device_get_named_child_node(const struct device *dev, const char *childname) { return fwnode_get_named_child_node(dev_fwnode(dev), childname); } EXPORT_SYMBOL_GPL(device_get_named_child_node); /** * fwnode_handle_get - Obtain a reference to a device node * @fwnode: Pointer to the device node to obtain the reference to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle. 
*/ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode) { if (!fwnode_has_op(fwnode, get)) return fwnode; return fwnode_call_ptr_op(fwnode, get); } EXPORT_SYMBOL_GPL(fwnode_handle_get); /** * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. * * Return: true if device is available for use. Otherwise, returns false. * * For fwnode node types that don't implement the .device_is_available() * operation, this function returns true. */ bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (IS_ERR_OR_NULL(fwnode)) return false; if (!fwnode_has_op(fwnode, device_is_available)) return true; return fwnode_call_bool_op(fwnode, device_is_available); } EXPORT_SYMBOL_GPL(fwnode_device_is_available); /** * device_get_child_node_count - return the number of child nodes for device * @dev: Device to count the child nodes for * * Return: the number of child nodes for a given device. */ unsigned int device_get_child_node_count(const struct device *dev) { struct fwnode_handle *child; unsigned int count = 0; device_for_each_child_node(dev, child) count++; return count; } EXPORT_SYMBOL_GPL(device_get_child_node_count); bool device_dma_supported(const struct device *dev) { return fwnode_call_bool_op(dev_fwnode(dev), device_dma_supported); } EXPORT_SYMBOL_GPL(device_dma_supported); enum dev_dma_attr device_get_dma_attr(const struct device *dev) { if (!fwnode_has_op(dev_fwnode(dev), device_get_dma_attr)) return DEV_DMA_NOT_SUPPORTED; return fwnode_call_int_op(dev_fwnode(dev), device_get_dma_attr); } EXPORT_SYMBOL_GPL(device_get_dma_attr); /** * fwnode_get_phy_mode - Get phy mode for given firmware node * @fwnode: Pointer to the given node * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int fwnode_get_phy_mode(const struct fwnode_handle *fwnode) { const char *pm; int err, i; err = fwnode_property_read_string(fwnode, "phy-mode", &pm); if (err < 0) err = fwnode_property_read_string(fwnode, "phy-connection-type", &pm); if (err < 0) return err; for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) if (!strcasecmp(pm, phy_modes(i))) return i; return -ENODEV; } EXPORT_SYMBOL_GPL(fwnode_get_phy_mode); /** * device_get_phy_mode - Get phy mode for given device * @dev: Pointer to the given device * * The function gets phy interface string from property 'phy-mode' or * 'phy-connection-type', and return its index in phy_modes table, or errno in * error case. */ int device_get_phy_mode(struct device *dev) { return fwnode_get_phy_mode(dev_fwnode(dev)); } EXPORT_SYMBOL_GPL(device_get_phy_mode); /** * fwnode_iomap - Maps the memory mapped IO for a given fwnode * @fwnode: Pointer to the firmware node * @index: Index of the IO range * * Return: a pointer to the mapped memory. */ void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index) { return fwnode_call_ptr_op(fwnode, iomap, index); } EXPORT_SYMBOL(fwnode_iomap); /** * fwnode_irq_get - Get IRQ directly from a fwnode * @fwnode: Pointer to the firmware node * @index: Zero-based index of the IRQ * * Return: Linux IRQ number on success. Negative errno on failure. 
*/ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { int ret; ret = fwnode_call_int_op(fwnode, irq_get, index); /* We treat mapping errors as invalid case */ if (ret == 0) return -EINVAL; return ret; } EXPORT_SYMBOL(fwnode_irq_get); /** * fwnode_irq_get_byname - Get IRQ from a fwnode using its name * @fwnode: Pointer to the firmware node * @name: IRQ name * * Description: * Find a match to the string @name in the 'interrupt-names' string array * in _DSD for ACPI, or of_node for Device Tree. Then get the Linux IRQ * number of the IRQ resource corresponding to the index of the matched * string. * * Return: Linux IRQ number on success, or negative errno otherwise. */ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name) { int index; if (!name) return -EINVAL; index = fwnode_property_match_string(fwnode, "interrupt-names", name); if (index < 0) return index; return fwnode_irq_get(fwnode, index); } EXPORT_SYMBOL(fwnode_irq_get_byname); /** * fwnode_graph_get_next_endpoint - Get next endpoint firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. Note that this function also puts a reference to @prev * unconditionally. * * Return: an endpoint firmware node pointer or %NULL if no more endpoints * are available. */ struct fwnode_handle * fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *ep, *port_parent = NULL; const struct fwnode_handle *parent; /* * If this function is in a loop and the previous iteration returned * an endpoint from fwnode->secondary, then we need to use the secondary * as parent rather than @fwnode. */ if (prev) { port_parent = fwnode_graph_get_port_parent(prev); parent = port_parent; } else { parent = fwnode; } if (IS_ERR_OR_NULL(parent)) return NULL; ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev); if (ep) goto out_put_port_parent; ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL); out_put_port_parent: fwnode_handle_put(port_parent); return ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); /** * fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint * @endpoint: Endpoint firmware node of the port * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; port = fwnode_get_parent(endpoint); parent = fwnode_call_ptr_op(port, graph_get_port_parent); fwnode_handle_put(port); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); /** * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote device the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. 
*/ struct fwnode_handle * fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; endpoint = fwnode_graph_get_remote_endpoint(fwnode); parent = fwnode_graph_get_port_parent(endpoint); fwnode_handle_put(endpoint); return parent; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); /** * fwnode_graph_get_remote_port - Return fwnode of a remote port * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote port the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); /** * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint * @fwnode: Endpoint firmware node pointing to the remote endpoint * * Extracts firmware node of a remote endpoint the @fwnode points to. * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. */ struct fwnode_handle * fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); static bool fwnode_graph_remote_available(struct fwnode_handle *ep) { struct fwnode_handle *dev_node; bool available; dev_node = fwnode_graph_get_remote_port_parent(ep); available = fwnode_device_is_available(dev_node); fwnode_handle_put(dev_node); return available; } /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph * @port: identifier of the port node * @endpoint: identifier of the endpoint node under the port node * @flags: fwnode lookup flags * * The caller is responsible for calling fwnode_handle_put() on the returned * fwnode pointer. * * Return: the fwnode handle of the local endpoint corresponding the port and * endpoint IDs or %NULL if not found. * * If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * * Does not return endpoints that belong to disabled devices or endpoints that * are unconnected, unless FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. */ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { struct fwnode_handle *ep, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); fwnode_graph_for_each_endpoint(fwnode, ep) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; if (enabled_only && !fwnode_graph_remote_available(ep)) continue; ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) continue; if (fwnode_ep.port != port) continue; if (fwnode_ep.id == endpoint) return ep; if (!endpoint_next) continue; /* * If the endpoint that has just been found is not the first * matching one and the ID of the one found previously is closer * to the requested endpoint ID, skip it. 
*/ if (fwnode_ep.id < endpoint || (best_ep && best_ep_id < fwnode_ep.id)) continue; fwnode_handle_put(best_ep); best_ep = fwnode_handle_get(ep); best_ep_id = fwnode_ep.id; } return best_ep; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); /** * fwnode_graph_get_endpoint_count - Count endpoints on a device node * @fwnode: The node related to a device * @flags: fwnode lookup flags * Count endpoints in a device node. * * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints * and endpoints connected to disabled devices are counted. */ unsigned int fwnode_graph_get_endpoint_count(const struct fwnode_handle *fwnode, unsigned long flags) { struct fwnode_handle *ep; unsigned int count = 0; fwnode_graph_for_each_endpoint(fwnode, ep) { if (flags & FWNODE_GRAPH_DEVICE_DISABLED || fwnode_graph_remote_available(ep)) count++; } return count; } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_count); /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle * @endpoint: pointer to the fwnode endpoint data structure * * Parse @fwnode representing a graph endpoint node and store the * information in @endpoint. The caller must hold a reference to * @fwnode. */ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint); } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); const void *device_get_match_data(const struct device *dev) { return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); static unsigned int fwnode_graph_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; struct fwnode_handle *ep; unsigned int count = 0; void *ret; fwnode_graph_for_each_endpoint(fwnode, ep) { if (matches && count >= matches_len) { fwnode_handle_put(ep); break; } node = fwnode_graph_get_remote_port_parent(ep); if (!fwnode_device_is_available(node)) { fwnode_handle_put(node); continue; } ret = match(node, con_id, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } static unsigned int fwnode_devcon_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { struct fwnode_handle *node; unsigned int count = 0; unsigned int i; void *ret; for (i = 0; ; i++) { if (matches && count >= matches_len) break; node = fwnode_find_reference(fwnode, con_id, i); if (IS_ERR(node)) break; ret = match(node, NULL, data); fwnode_handle_put(node); if (ret) { if (matches) matches[count] = ret; count++; } } return count; } /** * fwnode_connection_find_match - Find connection from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * * Find a connection with unique identifier @con_id between @fwnode and another * device node. @match will be used to convert the connection description to * data the caller is expecting to be returned. 
*/ void *fwnode_connection_find_match(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match) { unsigned int count; void *ret; if (!fwnode || !match) return NULL; count = fwnode_graph_devcon_matches(fwnode, con_id, data, match, &ret, 1); if (count) return ret; count = fwnode_devcon_matches(fwnode, con_id, data, match, &ret, 1); return count ? ret : NULL; } EXPORT_SYMBOL_GPL(fwnode_connection_find_match); /** * fwnode_connection_find_matches - Find connections from a device node * @fwnode: Device node with the connection * @con_id: Identifier for the connection * @data: Data for the match function * @match: Function to check and convert the connection description * @matches: (Optional) array of pointers to fill with matches * @matches_len: Length of @matches * * Find up to @matches_len connections with unique identifier @con_id between * @fwnode and other device nodes. @match will be used to convert the * connection description to data the caller is expecting to be returned * through the @matches array. * * If @matches is %NULL @matches_len is ignored and the total number of resolved * matches is returned. * * Return: Number of matches resolved, or negative errno. */ int fwnode_connection_find_matches(const struct fwnode_handle *fwnode, const char *con_id, void *data, devcon_match_fn_t match, void **matches, unsigned int matches_len) { unsigned int count_graph; unsigned int count_ref; if (!fwnode || !match) return -EINVAL; count_graph = fwnode_graph_devcon_matches(fwnode, con_id, data, match, matches, matches_len); if (matches) { matches += count_graph; matches_len -= count_graph; } count_ref = fwnode_devcon_matches(fwnode, con_id, data, match, matches, matches_len); return count_graph + count_ref; } EXPORT_SYMBOL_GPL(fwnode_connection_find_matches);
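/*
 * Editor's illustration (not part of property.c): a minimal consumer-driver
 * sketch showing how the graph and connection helpers above are typically
 * combined at probe time.  The function name demo_probe() and the "wakeup"
 * interrupt name are hypothetical; only helpers defined in this file, plus
 * dev_fwnode() and fwnode_handle_put(), are used.
 */
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/property.h>

static int demo_probe(struct device *dev)
{
	struct fwnode_handle *ep, *remote;
	int irq;

	/* Endpoint 0 of port 0 in the device's graph; NULL if absent. */
	ep = fwnode_graph_get_endpoint_by_id(dev_fwnode(dev), 0, 0, 0);
	if (!ep)
		return -ENODEV;

	/* Device node on the far side of the link; reference must be dropped. */
	remote = fwnode_graph_get_remote_port_parent(ep);
	fwnode_handle_put(ep);
	if (!remote)
		return -ENODEV;
	fwnode_handle_put(remote);

	/* Linux IRQ number selected by name from 'interrupt-names'. */
	irq = fwnode_irq_get_byname(dev_fwnode(dev), "wakeup");
	if (irq < 0)
		return irq;

	return 0;
}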
// SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> #include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/gre.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #ifdef CONFIG_XFRM_ESPINTCP #include <net/espintcp.h> #endif #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX
((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; /* prefixes smaller than this are stored in lists, not trees. */ #define INEXACT_PREFIXLEN_IPV4 16 #define INEXACT_PREFIXLEN_IPV6 48 struct xfrm_pol_inexact_node { struct rb_node node; union { xfrm_address_t addr; struct rcu_head rcu; }; u8 prefixlen; struct rb_root root; /* the policies matching this node, can be empty list */ struct hlist_head hhead; }; /* xfrm inexact policy search tree: * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); * | * +---- root_d: sorted by daddr:prefix * | | * | xfrm_pol_inexact_node * | | * | +- root: sorted by saddr/prefix * | | | * | | xfrm_pol_inexact_node * | | | * | | + root: unused * | | | * | | + hhead: saddr:daddr policies * | | * | +- coarse policies and all any:daddr policies * | * +---- root_s: sorted by saddr:prefix * | | * | xfrm_pol_inexact_node * | | * | + root: unused * | | * | + hhead: saddr:any policies * | * +---- coarse policies and all any:any policies * * Lookups return four candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2. any:daddr list from daddr tree * 3. saddr:daddr list from 2nd level daddr tree * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two results have same prio, youngest one wins. */ struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; u16 family; u8 dir, type; }; struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; /* list containing '*:*' policies */ struct hlist_head hhead; seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; /* tree sorted by saddr/prefix */ struct rb_root root_s; /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, }; struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; struct xfrm_flow_keys { struct flow_dissector_key_basic basic; struct flow_dissector_key_control control; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; } addrs; struct flow_dissector_key_ip ip; struct flow_dissector_key_icmp icmp; struct flow_dissector_key_ports ports; struct flow_dissector_key_keyid gre; }; static struct flow_dissector xfrm_session_dissector __ro_after_init; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct 
xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). 
*/ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. 
*/ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc(sizeof(struct xfrm_policy), gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ static void xfrm_policy_kill(struct xfrm_policy *policy) { write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, 
bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } 
EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) { struct xfrm_pol_inexact_bin *bin, *prev; struct xfrm_pol_inexact_key k = { .family = pol->family, .type = pol->type, .dir = dir, .if_id = pol->if_id, }; struct net *net = xp_net(pol); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); write_pnet(&k.net, net); bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); if (bin) return bin; bin = kzalloc(sizeof(*bin), GFP_ATOMIC); if (!bin) return NULL; bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, xfrm_pol_inexact_params); if (!prev) { list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); return bin; } kfree(bin); return IS_ERR(prev) ? NULL : prev; } static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, int family, u8 prefixlen) { if (xfrm_addr_any(addr, family)) return true; if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) return true; if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) return true; return false; } static bool xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) { const xfrm_address_t *addr; bool saddr_any, daddr_any; u8 prefixlen; addr = &policy->selector.saddr; prefixlen = policy->selector.prefixlen_s; saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); addr = &policy->selector.daddr; prefixlen = policy->selector.prefixlen_d; daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); return saddr_any && daddr_any; } static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, const xfrm_address_t *addr, u8 prefixlen) { node->addr = *addr; node->prefixlen = prefixlen; } static struct xfrm_pol_inexact_node * xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); return node; } static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: if (prefixlen == 0) return 0; mask = ~0U << (32 - prefixlen); ma = ntohl(a->a4) & mask; mb = ntohl(b->a4) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; if (pdw) { delta = memcmp(a->a6, b->a6, pdw << 2); if (delta) return delta; } if (pbi) { mask = ~0U << (32 - pbi); ma = ntohl(a->a6[pdw]) & mask; mb = ntohl(b->a6[pdw]) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; } break; default: break; } return delta; } 
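/*
 * Editor's illustration (standalone userspace sketch, not kernel code):
 * xfrm_policy_addr_delta() above masks both addresses down to the given
 * prefix length and returns a -1/0/1 ordering, which is what keeps the
 * inexact-policy rbtrees sorted by address prefix.  The IPv4 branch can be
 * reproduced outside the kernel as follows; addresses that differ only
 * below the prefix boundary compare equal, so a shorter prefix subsumes
 * the nodes beneath it when the tree is rebuilt.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static int addr4_delta(uint32_t a_be, uint32_t b_be, uint8_t prefixlen)
{
	uint32_t mask, ma, mb;

	if (prefixlen == 0)
		return 0;	/* zero-length prefix matches everything */

	mask = ~0U << (32 - prefixlen);
	ma = ntohl(a_be) & mask;
	mb = ntohl(b_be) & mask;
	if (ma < mb)
		return -1;
	return ma > mb ? 1 : 0;
}

int main(void)
{
	uint32_t a, b;

	inet_pton(AF_INET, "192.168.1.7", &a);
	inet_pton(AF_INET, "192.168.2.9", &b);

	printf("/24: %d\n", addr4_delta(a, b, 24));	/* -1: different /24 networks */
	printf("/16: %d\n", addr4_delta(a, b, 16));	/*  0: same /16 network */
	return 0;
}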
static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { unsigned int matched_s, matched_d; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (policy->walk.dead || !policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { if (policy->priority > p->priority) newpos = &p->bydst; else if (policy->priority == p->priority && policy->pos > p->pos) newpos = &p->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least * saddr or daddr for current node prefix. * * Matching both is fine, matching saddr in one policy * (but not daddr) and then matching only daddr in another * is a bug. */ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, &n->addr, n->prefixlen, family) == 0; matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, &n->addr, n->prefixlen, family) == 0; if (matches_s && matches_d) continue; WARN_ON_ONCE(!matches_s && !matches_d); if (matches_s) matched_s++; if (matches_d) matched_d++; WARN_ON_ONCE(matched_s && matched_d); } } static void xfrm_policy_inexact_node_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, struct rb_root *new, u16 family) { struct xfrm_pol_inexact_node *node; struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); restart: parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); prefixlen = min(node->prefixlen, n->prefixlen); delta = xfrm_policy_addr_delta(&n->addr, &node->addr, prefixlen, family); if (delta < 0) { p = &parent->rb_left; } else if (delta > 0) { p = &parent->rb_right; } else { bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } node->prefixlen = prefixlen; xfrm_policy_inexact_list_reinsert(net, node, family); if (same_prefixlen) { kfree_rcu(n, rcu); return; } rb_erase(*p, new); kfree_rcu(n, rcu); n = node; goto restart; } } rb_link_node_rcu(&n->node, parent, p); rb_insert_color(&n->node, new); } /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; struct rb_node *rnode; /* To-be-merged node v has a subtree. * * Dismantle it and insert its nodes to n->root. 
*/ while ((rnode = rb_first(&v->root)) != NULL) { node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); rb_erase(&node->node, &v->root); xfrm_policy_inexact_node_reinsert(net, node, &n->root, family); } hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } xfrm_policy_inexact_list_reinsert(net, n, family); } static struct xfrm_pol_inexact_node * xfrm_policy_inexact_insert_node(struct net *net, struct rb_root *root, xfrm_address_t *addr, u16 family, u8 prefixlen, u8 dir) { struct xfrm_pol_inexact_node *cached = NULL; struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; p = &root->rb_node; while (*p) { int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta == 0 && prefixlen >= node->prefixlen) { WARN_ON_ONCE(cached); /* ipsec policies got lost */ return node; } if (delta < 0) p = &parent->rb_left; else p = &parent->rb_right; if (prefixlen < node->prefixlen) { delta = xfrm_policy_addr_delta(addr, &node->addr, prefixlen, family); if (delta) continue; /* This node is a subnet of the new prefix. It needs * to be removed and re-inserted with the smaller * prefix and all nodes that are now also covered * by the reduced prefixlen. */ rb_erase(&node->node, root); if (!cached) { xfrm_pol_inexact_node_init(node, addr, prefixlen); cached = node; } else { /* This node also falls within the new * prefixlen. Merge the to-be-reinserted * node and this one. */ xfrm_policy_inexact_node_merge(net, node, cached, family); kfree_rcu(node, rcu); } /* restart */ p = &root->rb_node; parent = NULL; } } node = cached; if (!node) { node = xfrm_pol_inexact_node_alloc(addr, prefixlen); if (!node) return NULL; } rb_link_node_rcu(&node->node, parent, p); rb_insert_color(&node->node, root); return node; } static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) { struct xfrm_pol_inexact_node *node; struct rb_node *rn = rb_first(r); while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } rb_erase(&node->node, r); kfree_rcu(node, rcu); } } static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { list_del(&b->inexact_bins); kfree_rcu(b, rcu); } } static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) { struct net *net = read_pnet(&b->k.net); spin_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_inexact_prune_bin(b, false); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static void __xfrm_policy_inexact_flush(struct net *net) { struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(bin, false); } static struct hlist_head * xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, struct xfrm_policy *policy, u8 dir) { struct xfrm_pol_inexact_node *n; struct net *net; net = 
xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_s, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } /* daddr is fixed */ write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_d, &policy->selector.daddr, policy->family, policy->selector.prefixlen_d, dir); write_seqcount_end(&bin->count); if (!n) return NULL; /* saddr is wildcard */ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, policy->family, policy->selector.prefixlen_s)) return &n->hhead; write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &n->root, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) { struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *delpol; struct hlist_head *chain; struct net *net; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) return ERR_PTR(-ENOMEM); net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); if (!chain) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-ENOMEM); } delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-EEXIST); } chain = &net->xfrm.policy_inexact[dir]; xfrm_policy_insert_inexact_list(chain, policy); if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_head *odst; struct hlist_node *newpos; int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. 
*/ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; if (policy->walk.dead) continue; dir = xfrm_policy_id2dir(policy->index); if (dir >= XFRM_POLICY_MAX) continue; if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; sbits = lbits4; } else { dbits = rbits6; sbits = lbits6; } } else { if (policy->family == AF_INET) { dbits = lbits4; sbits = rbits4; } else { dbits = lbits6; sbits = rbits6; } } if (policy->selector.prefixlen_d < dbits || policy->selector.prefixlen_s < sbits) continue; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) goto out_unlock; } /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct hlist_node *n; hlist_for_each_entry_safe(policy, n, &net->xfrm.policy_inexact[dir], bydst_inexact_list) { hlist_del_rcu(&policy->bydst); hlist_del_init(&policy->bydst_inexact_list); } hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; for (i = hmask; i >= 0; i--) { hlist_for_each_entry_safe(policy, n, odst + i, bydst) hlist_del_rcu(&policy->bydst); } if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (policy->walk.dead) continue; dir = xfrm_policy_id2dir(policy->index); if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); continue; } hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } out_unlock: __xfrm_policy_inexact_flush(net); write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. 
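 * (xfrm_gen_index() below hands out net->xfrm.idx_generator | dir and then
 * advances the generator by 8, so the low three bits of an index always
 * encode the direction and xfrm_policy_id2dir() can recover it later; if a
 * caller-supplied index collides, the next loop iteration falls back to the
 * generator.)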
*/ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (net->xfrm.idx_generator | dir); net->xfrm.idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (del_timer(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_bin *b = data; return xfrm_pol_bin_key(&b->k, 0, seed); } static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xfrm_pol_inexact_key *key = arg->key; const struct xfrm_pol_inexact_bin *b = ptr; int ret; if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) return -1; ret = b->k.dir ^ key->dir; if (ret) return ret; ret = b->k.type ^ key->type; if (ret) return ret; ret = b->k.family ^ key->family; if (ret) return ret; return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), .hashfn = xfrm_pol_bin_key, .obj_hashfn = xfrm_pol_bin_obj, .obj_cmpfn = xfrm_pol_bin_cmp, .automatic_shrinking = true, }; static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy) { struct xfrm_policy *pol, *delpol = NULL; struct hlist_node *newpos = NULL; int i = 0; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = &pol->bydst_inexact_list; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); hlist_for_each_entry(pol, chain, bydst_inexact_list) { pol->pos = i; i++; } } static struct xfrm_policy 
*xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) { struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = pol; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else /* Packet offload policies enter to the head * to speed-up lookups. */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *delpol; struct hlist_head *chain; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); else delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return PTR_ERR(delpol); } __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; } return NULL; } struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { struct xfrm_pol_inexact_candidates cand; int i; bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } if (!xfrm_policy_find_inexact_candidates(&cand, bin, &sel->saddr, &sel->daddr)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } pol = NULL; for (i = 0; i < ARRAY_SIZE(cand.res); i++) { struct xfrm_policy *tmp; tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, 
type, dir, sel, ctx); if (!tmp) continue; if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, sel, ctx); } if (pol) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete(pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); if (bin && delete) xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid) { int dir, 
err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_dev_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. 
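 * (-ESRCH from xfrm_policy_match() only means "this policy does not match
 * the flow"; callers keep scanning on -ESRCH and abort the lookup on any
 * other error.)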
*/ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; int seq; again: seq = read_seqcount_begin(count); parent = rcu_dereference_raw(r->rb_node); while (parent) { struct xfrm_pol_inexact_node *node; int delta; node = rb_entry(parent, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta < 0) { parent = rcu_dereference_raw(parent->rb_left); continue; } else if (delta > 0) { parent = rcu_dereference_raw(parent->rb_right); continue; } return node; } if (read_seqcount_retry(count, seq)) goto again; return NULL; } static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct xfrm_pol_inexact_node *n; u16 family; if (!b) return false; family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; return true; } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, .if_id = if_id, }; write_pnet(&k.net, net); return rhashtable_lookup(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; } static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { int err; if (pol->priority > priority) break; err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); continue; } if (prefer) { /* matches. Is it older than *prefer? 
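 * Ties on priority are broken by ->pos, the insertion-order index assigned
 * in xfrm_policy_insert_inexact_list(), so effectively the policy that was
 * inserted first wins.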
*/ if (pol->priority == priority && prefer->pos < pol->pos) return prefer; } return pol; } return NULL; } static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, fl, type, family, if_id); if (!tmp) continue; if (IS_ERR(tmp)) return tmp; prefer = tmp; } return prefer; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) goto skip_inexact; bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) goto fail; } skip_inexact: if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. 
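 * They are linked via __xfrm_policy_link() under XFRM_POLICY_MAX + dir and
 * never enter the bydst/byidx hash tables, hence the hlist_unhashed() check
 * below before touching the hash chains.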
*/ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. 
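 * (xfrm_tmpl_resolve_one() returns the number of states stored in xfrm[],
 * or a negative errno; optional templates that cannot be resolved are
 * skipped, and -ESRCH from xfrm_state_find() is reported as -EAGAIN so the
 * caller can wait for the key manager.)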
*/ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, tmpl->encap_family, 0); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); xfrm_state_put(x); error = -EINVAL; goto fail; } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ? tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static int xfrm_get_tos(const struct flowi *fl, int family) { if (family == AF_INET) return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. 
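 * (The result is a chain of xfrm_dst entries, one per state: each child
 * points at the next transformation, xdst0->path points at the original
 * route, and the per-entry header/trailer lengths are accumulated so header
 * room and MTU can be derived for the whole bundle.)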
*/ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); tos = xfrm_get_tos(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; int oif; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET) family = xfrm[i]->props.family; oif = fl->flowi_oif ? : fl->flowi_l3mdev; dst = xfrm_dst_lookup(xfrm[i], tos, oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->lastuse = now; dst1->input = dst_discard; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) dst1->output = afinfo->output; else dst1->output = dst_discard_out; rcu_read_unlock(); xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) { *num_pols = 0; return PTR_ERR(pols[0]); } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, 
pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, dst->ops->family); skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); /* Fixup the mark to support VTI. 
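 * (The policy's mark is written to the skb while xfrm_decode_session() runs
 * so that the decoded flowi carries the mark that selected this policy, and
 * the packet's own mark is restored immediately afterwards, as was done for
 * the peek above.)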
*/ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family); skb->mark = skb_mark; dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb->sk, skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (del_timer(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { 
xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... */ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. 
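 * (sysctl_larval_drop selects between the two error paths below: -EREMOTE,
 * which xfrm_lookup_route() turns into a blackhole route, or -EAGAIN.)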
We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && dst->xfrm->props.mode == XFRM_MODE_TUNNEL) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. 
*/
static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
	      unsigned short family, u32 if_id)
{
	if (xfrm_state_kern(x))
		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
	return	x->id.proto == tmpl->id.proto &&
		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
		x->props.mode == tmpl->mode &&
		(tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
		!(x->props.mode != XFRM_MODE_TRANSPORT &&
		  xfrm_state_addr_cmp(tmpl, x, family)) &&
		(if_id == 0 || if_id == x->if_id);
}

/*
 * 0 or a positive value is returned when validation succeeds: either the
 * bypass value (because of an optional transport-mode template) or the next
 * index after the secpath state that matched the template.
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp,
	       int start, unsigned short family, u32 if_id)
{
	int idx = start;

	if (tmpl->optional) {
		if (tmpl->mode == XFRM_MODE_TRANSPORT)
			return start;
	} else
		start = -1;
	for (; idx < sp->len; idx++) {
		if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id))
			return ++idx;
		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
			if (idx < sp->verified_cnt) {
				/* Secpath entry previously verified, consider optional and
				 * continue searching
				 */
				continue;
			}

			if (start == -1)
				start = -2-idx;
			break;
		}
	}
	return start;
}

static void
decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
	struct flowi4 *fl4 = &fl->u.ip4;

	memset(fl4, 0, sizeof(struct flowi4));

	if (reverse) {
		fl4->saddr = flkeys->addrs.ipv4.dst;
		fl4->daddr = flkeys->addrs.ipv4.src;
		fl4->fl4_sport = flkeys->ports.dst;
		fl4->fl4_dport = flkeys->ports.src;
	} else {
		fl4->saddr = flkeys->addrs.ipv4.src;
		fl4->daddr = flkeys->addrs.ipv4.dst;
		fl4->fl4_sport = flkeys->ports.src;
		fl4->fl4_dport = flkeys->ports.dst;
	}

	switch (flkeys->basic.ip_proto) {
	case IPPROTO_GRE:
		fl4->fl4_gre_key = flkeys->gre.keyid;
		break;
	case IPPROTO_ICMP:
		fl4->fl4_icmp_type = flkeys->icmp.type;
		fl4->fl4_icmp_code = flkeys->icmp.code;
		break;
	}

	fl4->flowi4_proto = flkeys->basic.ip_proto;
	fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK;
}

#if IS_ENABLED(CONFIG_IPV6)
static void
decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
	struct flowi6 *fl6 = &fl->u.ip6;

	memset(fl6, 0, sizeof(struct flowi6));

	if (reverse) {
		fl6->saddr = flkeys->addrs.ipv6.dst;
		fl6->daddr = flkeys->addrs.ipv6.src;
		fl6->fl6_sport = flkeys->ports.dst;
		fl6->fl6_dport = flkeys->ports.src;
	} else {
		fl6->saddr = flkeys->addrs.ipv6.src;
		fl6->daddr = flkeys->addrs.ipv6.dst;
		fl6->fl6_sport = flkeys->ports.src;
		fl6->fl6_dport = flkeys->ports.dst;
	}

	switch (flkeys->basic.ip_proto) {
	case IPPROTO_GRE:
		fl6->fl6_gre_key = flkeys->gre.keyid;
		break;
	case IPPROTO_ICMPV6:
		fl6->fl6_icmp_type = flkeys->icmp.type;
		fl6->fl6_icmp_code = flkeys->icmp.code;
		break;
	}

	fl6->flowi6_proto = flkeys->basic.ip_proto;
}
#endif

int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
			  unsigned int family, int reverse)
{
	struct xfrm_flow_keys flkeys;

	memset(&flkeys, 0, sizeof(flkeys));
	__skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys,
			   NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	switch (family) {
	case AF_INET:
		decode_session4(&flkeys, fl, reverse);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		decode_session6(&flkeys, fl, reverse);
		break;
#endif
	default:
return -EAFNOSUPPORT; } fl->flowi_mark = skb->mark; if (reverse) { fl->flowi_oif = skb->skb_iif; } else { int oif = 0; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; fl->flowi_oif = oif; } return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } static bool icmp_err_packet(const struct flowi *fl, unsigned short family) { const struct flowi4 *fl4 = &fl->u.ip4; if (family == AF_INET && fl4->flowi4_proto == IPPROTO_ICMP && (fl4->fl4_icmp_type == ICMP_DEST_UNREACH || fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED)) return true; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) { const struct flowi6 *fl6 = &fl->u.ip6; if (fl6->flowi6_proto == IPPROTO_ICMPV6 && (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH || fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG || fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED)) return true; } #endif return false; } static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, const struct flowi *fl, struct flowi *fl1) { bool ret = true; struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); int hl = family == AF_INET ? (sizeof(struct iphdr) + sizeof(struct icmphdr)) : (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)); if (!newskb) return true; if (!pskb_pull(newskb, hl)) goto out; skb_reset_network_header(newskb); if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0) goto out; fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; fl1->flowi_tos = fl->flowi_tos; nf_nat_decode_session(newskb, fl1, family); ret = false; out: consume_skb(newskb); return ret; } static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family, const struct xfrm_selector *sel, const struct flowi *fl) { bool ret = false; if (icmp_err_packet(fl, family)) { struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return ret; ret = xfrm_selector_match(sel, &fl1, family); } return ret; } static inline struct xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb, const struct flowi *fl, unsigned short family, u32 if_id) { struct xfrm_policy *pol = NULL; if (icmp_err_packet(fl, family)) { struct flowi fl1; struct net *net = dev_net(skb->dev); if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return pol; pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id); if (IS_ERR(pol)) pol = NULL; } return pol; } static inline struct dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl, unsigned short family, struct dst_entry *dst) { if (icmp_err_packet(fl, family)) { struct net *net = dev_net(skb->dev); struct dst_entry *dst2; struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return dst; dst_hold(dst); dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP)); if (IS_ERR(dst2)) return dst; if (dst2->xfrm) { dst_release(dst); dst = dst2; } else { dst_release(dst2); } } return dst; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { struct 
xfrm_if_decode_session_result r; if (ifcb->decode_session(skb, family, &r)) { if_id = r.if_id; net = r.net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ sp = skb_sec_path(skb); if (sp) { int i; for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; int ret = 0; if (!xfrm_selector_match(&x->sel, &fl, family)) { ret = 1; if (x->props.flags & XFRM_STATE_ICMP && xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl)) ret = 0; if (ret) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol && dir == XFRM_POLICY_FWD) pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id); if (!pol) { if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } /* This lockless write can happen from different cpus. */ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); xfrm_pol_put(pols[0]); return 0; } /* This write can happen from different cpus. */ WRITE_ONCE(pols[1]->curlft.use_time, ktime_get_real_seconds()); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int ti = 0; int i, k; sp = skb_sec_path(skb); if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. * Upon success, marks secpath entries as having been * verified to allow them to be skipped in future policy * checks (e.g. nested tunnels). 
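 * A negative k below reports failure: k == -1 means no matching template,
 * while k < -1 carries the "-2 - errored_index" encoding from
 * xfrm_policy_ok() and is decoded back into xerr_idx for
 * xfrm_secpath_reject().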
*/ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } xfrm_pols_put(pols, npols); sp->verified_cnt = k; return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(net, skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } if (dst && !dst->xfrm) dst = xfrm_out_fwd_icmp(skb, &fl, family, dst); skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { if (dst->obsolete) sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. 
*/ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; 
rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (xfrm_policy_afinfo[i] != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(htab->table, sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 
1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); xfrm_hash_free(htab->table, sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT; rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; return 0; out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, }; static const struct flow_dissector_key xfrm_flow_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct xfrm_flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct xfrm_flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct xfrm_flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct xfrm_flow_keys, gre), }, { .key_id = FLOW_DISSECTOR_KEY_IP, .offset = offsetof(struct xfrm_flow_keys, ip), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct xfrm_flow_keys, icmp), }, }; void __init xfrm_init(void) { skb_flow_dissector_init(&xfrm_session_dissector, xfrm_flow_dissector_keys, ARRAY_SIZE(xfrm_flow_dissector_keys)); register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif register_xfrm_state_bpf(); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) 
audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, const struct xfrm_selector *sel_tgt) { if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { if (sel_tgt->family == sel_cmp->family && xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, sel_cmp->family) && xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, sel_cmp->family) && sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { return true; } } else { if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { return true; } } return false; } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; u32 priority = ~0U; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; break; } } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; } } xfrm_pol_hold(ret); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 
0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) { NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; } } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (!pol) { NL_SET_ERR_MSG(extack, "Target policy not found"); err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif
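/*
 * Illustrative sketch (an assumption, not code from this file): protocol
 * receive paths call __xfrm_policy_check() above - normally through the
 * xfrm_policy_check()-style wrappers in net/xfrm.h - to decide whether the
 * SAs a packet actually used satisfy an ALLOW policy for the inbound
 * direction. The caller below is a simplified stand-in.
 */
#if 0	/* example only */
static int example_rcv_policy_gate(struct sock *sk, struct sk_buff *skb)
{
	/* Nonzero return means the packet passed inbound IPsec policy. */
	if (!__xfrm_policy_check(sk, XFRM_POLICY_IN, skb, AF_INET)) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}
#endif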
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM jbd2

#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_JBD2_H

#include <linux/jbd2.h>
#include <linux/tracepoint.h>

struct transaction_chp_stats_s;
struct transaction_run_stats_s;

TRACE_EVENT(jbd2_checkpoint,

	TP_PROTO(journal_t *journal, int result),

	TP_ARGS(journal, result),

	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(int, result)
	),

	TP_fast_assign(
		__entry->dev = journal->j_fs_dev->bd_dev;
		__entry->result = result;
	),

	TP_printk("dev %d,%d result %d",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result)
);

DECLARE_EVENT_CLASS(jbd2_commit,

	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

	TP_ARGS(journal, commit_transaction),

	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(char, sync_commit)
		__field(tid_t, transaction)
	),

	TP_fast_assign(
		__entry->dev = journal->j_fs_dev->bd_dev;
		__entry->sync_commit = commit_transaction->t_synchronous_commit;
		__entry->transaction = commit_transaction->t_tid;
	),

	TP_printk("dev %d,%d transaction %u sync %d",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->transaction, __entry->sync_commit)
);

DEFINE_EVENT(jbd2_commit, jbd2_start_commit,
	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
	TP_ARGS(journal, commit_transaction)
);

DEFINE_EVENT(jbd2_commit, jbd2_commit_locking,
	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
	TP_ARGS(journal, commit_transaction)
);

DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing,
	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
	TP_ARGS(journal, commit_transaction)
);
DEFINE_EVENT(jbd2_commit, jbd2_commit_logging, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); TRACE_EVENT(jbd2_end_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( __field( dev_t, dev ) __field( char, sync_commit ) __field( tid_t, transaction ) __field( tid_t, head ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), TP_printk("dev %d,%d transaction %u sync %d head %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); DECLARE_EVENT_CLASS(jbd2_handle_start_class, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); TRACE_EVENT(jbd2_handle_extend, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int buffer_credits, int requested_blocks), TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, buffer_credits ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->buffer_credits = buffer_credits; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "buffer_credits %d requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->buffer_credits, __entry->requested_blocks) ); TRACE_EVENT(jbd2_handle_stats, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int interval, int sync, int requested_blocks, int dirtied_blocks), TP_ARGS(dev, tid, type, line_no, interval, sync, requested_blocks, dirtied_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( 
unsigned int, type ) __field( unsigned int, line_no ) __field( int, interval ) __field( int, sync ) __field( int, requested_blocks) __field( int, dirtied_blocks ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->interval = interval; __entry->sync = sync; __entry->requested_blocks = requested_blocks; __entry->dirtied_blocks = dirtied_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u interval %d " "sync %d requested_blocks %d dirtied_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->interval, __entry->sync, __entry->requested_blocks, __entry->dirtied_blocks) ); TRACE_EVENT(jbd2_run_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_run_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, wait ) __field( unsigned long, request_delay ) __field( unsigned long, running ) __field( unsigned long, locked ) __field( unsigned long, flushing ) __field( unsigned long, logging ) __field( __u32, handle_count ) __field( __u32, blocks ) __field( __u32, blocks_logged ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->request_delay = stats->rs_request_delay; __entry->running = stats->rs_running; __entry->locked = stats->rs_locked; __entry->flushing = stats->rs_flushing; __entry->logging = stats->rs_logging; __entry->handle_count = stats->rs_handle_count; __entry->blocks = stats->rs_blocks; __entry->blocks_logged = stats->rs_blocks_logged; ), TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u " "locked %u flushing %u logging %u handle_count %u " "blocks %u blocks_logged %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->request_delay), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), jiffies_to_msecs(__entry->flushing), jiffies_to_msecs(__entry->logging), __entry->handle_count, __entry->blocks, __entry->blocks_logged) ); TRACE_EVENT(jbd2_checkpoint_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_chp_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) __field( __u32, written ) __field( __u32, dropped ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; __entry->written = stats->cs_written; __entry->dropped = stats->cs_dropped; ), TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u " "written %u dropped %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); TRACE_EVENT(jbd2_update_log_tail, TP_PROTO(journal_t *journal, tid_t first_tid, unsigned long block_nr, unsigned long freed), TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) __field(unsigned long, freed ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", MAJOR(__entry->dev), MINOR(__entry->dev), 
__entry->tail_sequence, __entry->first_tid, __entry->block_nr, __entry->freed) ); TRACE_EVENT(jbd2_write_superblock, TP_PROTO(journal_t *journal, blk_opf_t write_flags), TP_ARGS(journal, write_flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( blk_opf_t, write_flags ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->write_flags = write_flags; ), TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev), MINOR(__entry->dev), (__force u32)__entry->write_flags) ); TRACE_EVENT(jbd2_lock_buffer_stall, TP_PROTO(dev_t dev, unsigned long stall_ms), TP_ARGS(dev, stall_ms), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, stall_ms ) ), TP_fast_assign( __entry->dev = dev; __entry->stall_ms = stall_ms; ), TP_printk("dev %d,%d stall_ms %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->stall_ms) ); DECLARE_EVENT_CLASS(jbd2_journal_shrink, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); TRACE_EVENT(jbd2_shrink_scan_exit, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long nr_shrunk, unsigned long count), TP_ARGS(journal, nr_to_scan, nr_shrunk, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, nr_shrunk) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->nr_shrunk = nr_shrunk; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->nr_shrunk, __entry->count) ); TRACE_EVENT(jbd2_shrink_checkpoint_list, TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, unsigned long nr_freed, tid_t next_tid), TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, first_tid) __field(tid_t, tid) __field(tid_t, last_tid) __field(unsigned long, nr_freed) __field(tid_t, next_tid) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->first_tid = first_tid; __entry->tid = tid; __entry->last_tid = last_tid; __entry->nr_freed = nr_freed; __entry->next_tid = next_tid; ), TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " "next transaction %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_tid, __entry->tid, __entry->last_tid, __entry->nr_freed, __entry->next_tid) ); #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
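/*
 * Illustrative sketch (not part of this header): each TRACE_EVENT() or
 * DEFINE_EVENT() above generates a trace_<name>() helper taking the
 * TP_PROTO arguments, which jbd2 code calls at the matching point. The
 * surrounding function below is hypothetical; only the generated
 * trace_jbd2_checkpoint() name follows from the definitions above.
 */
#if 0	/* example only */
static void example_checkpoint_and_trace(journal_t *journal)
{
	int result = jbd2_log_do_checkpoint(journal);	/* assumed helper */

	trace_jbd2_checkpoint(journal, result);		/* emits jbd2_checkpoint */
}
#endif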
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LINUX_RESUME_USER_MODE_H
#define LINUX_RESUME_USER_MODE_H

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/rseq.h>
#include <linux/blk-cgroup.h>

/**
 * set_notify_resume - cause resume_user_mode_work() to be called
 * @task: task that will call resume_user_mode_work()
 *
 * Calling this arranges that @task will call resume_user_mode_work()
 * before returning to user mode. If it's already running in user mode,
 * it will enter the kernel and call resume_user_mode_work() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
		kick_process(task);
}

/**
 * resume_user_mode_work - Perform work before returning to user mode
 * @regs: user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set. Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted. The caller in arch code has cleared
 * %TIF_NOTIFY_RESUME before the call. If the flag gets set again
 * asynchronously, this will be called again before we return to
 * user mode.
 *
 * Called without locks.
 */
static inline void resume_user_mode_work(struct pt_regs *regs)
{
	clear_thread_flag(TIF_NOTIFY_RESUME);
	/*
	 * This barrier pairs with task_work_add()->set_notify_resume() after
	 * hlist_add_head(task->task_works);
	 */
	smp_mb__after_atomic();
	if (unlikely(task_work_pending(current)))
		task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
	if (unlikely(current->cached_requested_key)) {
		key_put(current->cached_requested_key);
		current->cached_requested_key = NULL;
	}
#endif

	mem_cgroup_handle_over_high(GFP_KERNEL);
	blkcg_maybe_throttle_current();

	rseq_handle_notify_resume(NULL, regs);
}

#endif /* LINUX_RESUME_USER_MODE_H */
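/*
 * Illustrative sketch (an assumption, not part of this header): the
 * architecture's exit-to-user loop typically keeps calling
 * resume_user_mode_work() while TIF_NOTIFY_RESUME is pending. The loop
 * below is a simplified stand-in for such arch/entry code.
 */
#if 0	/* example only */
static void example_exit_to_user_loop(struct pt_regs *regs)
{
	unsigned long ti_work = read_thread_flags();

	while (ti_work & _TIF_NOTIFY_RESUME) {
		resume_user_mode_work(regs);
		ti_work = read_thread_flags();
	}
}
#endif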
2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif vdata[CS_RAW].mask = tk->tkr_raw.mask; vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = 
tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). * Note: No need to have a second copy. */ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); } void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); }
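/*
 * Illustrative sketch of the vdso_update_begin()/vdso_update_end() pairing
 * documented above, as architecture code might use it to update its own
 * vdso_data fields. The function name is a placeholder, not a real
 * interface.
 */
#if 0	/* example only */
static void example_arch_vdso_update(void)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();
	unsigned long flags;

	flags = vdso_update_begin();	/* take timekeeper lock, invalidate vdso seq */
	/* ... update architecture specific vdso_data fields here ... */
	vdso_update_end(flags);		/* revalidate seq, sync data, drop lock */
}
#endif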
50 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Regents of the University of California */ #ifndef _ASM_RISCV_SWITCH_TO_H #define _ASM_RISCV_SWITCH_TO_H #include <linux/jump_label.h> #include <linux/sched/task_stack.h> #include <linux/mm_types.h> #include <asm/vector.h> #include <asm/cpufeature.h> #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/csr.h> #ifdef CONFIG_FPU extern void __fstate_save(struct task_struct *save_to); extern void __fstate_restore(struct task_struct *restore_from); static inline void __fstate_clean(struct pt_regs *regs) { regs->status = (regs->status & ~SR_FS) | SR_FS_CLEAN; } static inline void fstate_off(struct task_struct *task, struct pt_regs *regs) { regs->status = (regs->status & ~SR_FS) | SR_FS_OFF; } static inline void fstate_save(struct task_struct *task, struct pt_regs *regs) { if ((regs->status & SR_FS) == SR_FS_DIRTY) { __fstate_save(task); __fstate_clean(regs); } } static inline void fstate_restore(struct task_struct *task, struct pt_regs *regs) { if ((regs->status & SR_FS) != SR_FS_OFF) { __fstate_restore(task); __fstate_clean(regs); } } static inline void __switch_to_fpu(struct task_struct *prev, struct task_struct *next) { struct pt_regs *regs; regs = task_pt_regs(prev); fstate_save(prev, regs); fstate_restore(next, task_pt_regs(next)); } static __always_inline bool has_fpu(void) { return riscv_has_extension_likely(RISCV_ISA_EXT_f) || riscv_has_extension_likely(RISCV_ISA_EXT_d); } #else static __always_inline bool has_fpu(void) { return false; } #define fstate_save(task, regs) do { } while (0) #define fstate_restore(task, regs) do { } while (0) #define __switch_to_fpu(__prev, __next) do { } while (0) #endif extern struct task_struct *__switch_to(struct task_struct *, struct task_struct *); static inline bool switch_to_should_flush_icache(struct task_struct *task) { #ifdef CONFIG_SMP bool stale_mm = task->mm && task->mm->context.force_icache_flush; bool stale_thread = task->thread.force_icache_flush; bool thread_migrated = smp_processor_id() != task->thread.prev_cpu; return thread_migrated && (stale_mm || stale_thread); #else return false; #endif } #ifdef CONFIG_SMP #define __set_prev_cpu(thread) ((thread).prev_cpu = smp_processor_id()) #else #define __set_prev_cpu(thread) #endif #define switch_to(prev, next, last) \ do { \ struct task_struct *__prev = (prev); \ struct task_struct *__next = (next); \ __set_prev_cpu(__prev->thread); \ if (has_fpu()) \ __switch_to_fpu(__prev, __next); \ if (has_vector()) \ __switch_to_vector(__prev, __next); \ if (switch_to_should_flush_icache(__next)) \ local_flush_icache_all(); \ ((last) = __switch_to(__prev, __next)); \ } while (0) #endif /* _ASM_RISCV_SWITCH_TO_H */
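/*
 * Illustrative sketch (an assumption, not code from this header): the
 * scheduler's context-switch path is the consumer of the switch_to()
 * macro above; it saves prev's FPU/vector state, restores next's, may
 * flush the icache, and finally calls __switch_to(). The function below
 * is a simplified stand-in for that caller.
 */
#if 0	/* example only */
static struct task_struct *example_context_switch(struct task_struct *prev,
						  struct task_struct *next)
{
	struct task_struct *last;

	switch_to(prev, next, last);
	return last;	/* the task we switched away from, once prev runs again */
}
#endif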
1407 171 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(unsigned long call_site, bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h>
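/*
 * Illustrative note (an assumption, not part of this header): with
 * TRACE_SYSTEM "percpu", these events are normally exposed through
 * tracefs and can be enabled from user space, e.g. (mount point assumed):
 *
 *	echo 1 > /sys/kernel/tracing/events/percpu/percpu_alloc_percpu/enable
 *	cat /sys/kernel/tracing/trace_pipe
 */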
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/drivers/char/misc.c
 *
 * Generic misc open routine by Johan Myreen
 *
 * Based on code from Linus
 *
 * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's
 * changes incorporated into 0.97pl4
 * by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92)
 * See busmouse.c for particulars.
 *
 * Made things a lot more modular - easy to compile in just one or two
 * of the misc drivers, as they are now completely independent. Linus.
 *
 * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk>
 *
 * Fixed a failing symbol register to free the device registration
 * Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96
 *
 * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96
 *
 * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96
 *
 * Handling of mouse minor numbers for kerneld:
 *  Idea by Jacques Gelinas <jack@solucorp.qc.ca>,
 *  adapted by Bjorn Ekwall <bj0rn@blox.se>
 *  corrected by Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Changes for kmod (from kerneld):
 *	Cyrus Durgin <cider@speakeasy.org>
 *
 * Added devfs support.
Richard Gooch <rgooch@atnf.csiro.au> 10-Jan-1998 */ #include <linux/module.h> #include <linux/fs.h> #include <linux/errno.h> #include <linux/miscdevice.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/device.h> #include <linux/tty.h> #include <linux/kmod.h> #include <linux/gfp.h> /* * Head entry for the doubly linked miscdevice list */ static LIST_HEAD(misc_list); static DEFINE_MUTEX(misc_mtx); /* * Assigned numbers, used for dynamic minors */ #define DYNAMIC_MINORS 128 /* like dynamic majors */ static DEFINE_IDA(misc_minors_ida); static int misc_minor_alloc(void) { int ret; ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL); if (ret >= 0) { ret = DYNAMIC_MINORS - ret - 1; } else { ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1, MINORMASK, GFP_KERNEL); } return ret; } static void misc_minor_free(int minor) { if (minor < DYNAMIC_MINORS) ida_free(&misc_minors_ida, DYNAMIC_MINORS - minor - 1); else if (minor > MISC_DYNAMIC_MINOR) ida_free(&misc_minors_ida, minor); } #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&misc_mtx); return seq_list_start(&misc_list, *pos); } static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &misc_list, pos); } static void misc_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&misc_mtx); } static int misc_seq_show(struct seq_file *seq, void *v) { const struct miscdevice *p = list_entry(v, struct miscdevice, list); seq_printf(seq, "%3i %s\n", p->minor, p->name ? p->name : ""); return 0; } static const struct seq_operations misc_seq_ops = { .start = misc_seq_start, .next = misc_seq_next, .stop = misc_seq_stop, .show = misc_seq_show, }; #endif static int misc_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct miscdevice *c = NULL, *iter; int err = -ENODEV; const struct file_operations *new_fops = NULL; mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) { mutex_unlock(&misc_mtx); request_module("char-major-%d-%d", MISC_MAJOR, minor); mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) goto fail; } /* * Place the miscdevice in the file's * private_data so it can be used by the * file operations, including f_op->open below */ file->private_data = c; err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); fail: mutex_unlock(&misc_mtx); return err; } static char *misc_devnode(const struct device *dev, umode_t *mode) { const struct miscdevice *c = dev_get_drvdata(dev); if (mode && c->mode) *mode = c->mode; if (c->nodename) return kstrdup(c->nodename, GFP_KERNEL); return NULL; } static const struct class misc_class = { .name = "misc", .devnode = misc_devnode, }; static const struct file_operations misc_fops = { .owner = THIS_MODULE, .open = misc_open, .llseek = noop_llseek, }; /** * misc_register - register a miscellaneous device * @misc: device structure * * Register a miscellaneous device with the kernel. If the minor * number is set to %MISC_DYNAMIC_MINOR a minor number is assigned * and placed in the minor field of the structure. For other cases * the minor number requested is used. 
* * The structure passed is linked into the kernel and may not be * destroyed until it has been unregistered. By default, an open() * syscall to the device sets file->private_data to point to the * structure. Drivers don't need open in fops for this. * * A zero is returned on success and a negative errno code for * failure. */ int misc_register(struct miscdevice *misc) { dev_t dev; int err = 0; bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR); INIT_LIST_HEAD(&misc->list); mutex_lock(&misc_mtx); if (is_dynamic) { int i = misc_minor_alloc(); if (i < 0) { err = -EBUSY; goto out; } misc->minor = i; } else { struct miscdevice *c; list_for_each_entry(c, &misc_list, list) { if (c->minor == misc->minor) { err = -EBUSY; goto out; } } } dev = MKDEV(MISC_MAJOR, misc->minor); misc->this_device = device_create_with_groups(&misc_class, misc->parent, dev, misc, misc->groups, "%s", misc->name); if (IS_ERR(misc->this_device)) { if (is_dynamic) { misc_minor_free(misc->minor); misc->minor = MISC_DYNAMIC_MINOR; } err = PTR_ERR(misc->this_device); goto out; } /* * Add it to the front, so that later devices can "override" * earlier defaults */ list_add(&misc->list, &misc_list); out: mutex_unlock(&misc_mtx); return err; } EXPORT_SYMBOL(misc_register); /** * misc_deregister - unregister a miscellaneous device * @misc: device to unregister * * Unregister a miscellaneous device that was previously * successfully registered with misc_register(). */ void misc_deregister(struct miscdevice *misc) { if (WARN_ON(list_empty(&misc->list))) return; mutex_lock(&misc_mtx); list_del(&misc->list); device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor)); misc_minor_free(misc->minor); mutex_unlock(&misc_mtx); } EXPORT_SYMBOL(misc_deregister); static int __init misc_init(void) { int err; struct proc_dir_entry *ret; ret = proc_create_seq("misc", 0, NULL, &misc_seq_ops); err = class_register(&misc_class); if (err) goto fail_remove; err = -EIO; if (register_chrdev(MISC_MAJOR, "misc", &misc_fops)) goto fail_printk; return 0; fail_printk: pr_err("unable to get major %d for misc devices\n", MISC_MAJOR); class_unregister(&misc_class); fail_remove: if (ret) remove_proc_entry("misc", NULL); return err; } subsys_initcall(misc_init);
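/*
 * Illustrative sketch of the interface documented above: a minimal driver
 * registering a dynamic-minor misc device. All "example_*" names and the
 * empty fops are placeholders, not part of misc.c.
 */
#if 0	/* example only */
static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice example_misc = {
	.minor = MISC_DYNAMIC_MINOR,	/* request a dynamically assigned minor */
	.name  = "example",		/* device node appears as /dev/example */
	.fops  = &example_fops,
};

static int __init example_init(void)
{
	return misc_register(&example_misc);	/* 0 on success, -errno on failure */
}

static void __exit example_exit(void)
{
	misc_deregister(&example_misc);
}
module_init(example_init);
module_exit(example_exit);
#endif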
59 59 59 59 59 59 59 58 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 // SPDX-License-Identifier: GPL-2.0 /* * Driver core interface to the pinctrl subsystem. * * Copyright (C) 2012 ST-Ericsson SA * Written on behalf of Linaro for ST-Ericsson * Based on bits of regulator core, gpio core and clk core * * Author: Linus Walleij <linus.walleij@linaro.org> */ #include <linux/device.h> #include <linux/pinctrl/devinfo.h> #include <linux/pinctrl/consumer.h> #include <linux/slab.h> /** * pinctrl_bind_pins() - called by the device core before probe * @dev: the device that is just about to probe */ int pinctrl_bind_pins(struct device *dev) { int ret; if (dev->of_node_reused) return 0; dev->pins = devm_kzalloc(dev, sizeof(*(dev->pins)), GFP_KERNEL); if (!dev->pins) return -ENOMEM; dev->pins->p = devm_pinctrl_get(dev); if (IS_ERR(dev->pins->p)) { dev_dbg(dev, "no pinctrl handle\n"); ret = PTR_ERR(dev->pins->p); goto cleanup_alloc; } dev->pins->default_state = pinctrl_lookup_state(dev->pins->p, PINCTRL_STATE_DEFAULT); if (IS_ERR(dev->pins->default_state)) { dev_dbg(dev, "no default pinctrl state\n"); ret = 0; goto cleanup_get; } dev->pins->init_state = pinctrl_lookup_state(dev->pins->p, PINCTRL_STATE_INIT); if (IS_ERR(dev->pins->init_state)) { /* Not supplying this state is perfectly legal */ dev_dbg(dev, "no init pinctrl state\n"); ret = pinctrl_select_state(dev->pins->p, dev->pins->default_state); } else { ret = pinctrl_select_state(dev->pins->p, dev->pins->init_state); } if (ret) { dev_dbg(dev, "failed to activate initial pinctrl state\n"); goto cleanup_get; } #ifdef CONFIG_PM /* * If power management is enabled, we also look for the optional * sleep and idle pin states, with semantics as defined in * <linux/pinctrl/pinctrl-state.h> */ dev->pins->sleep_state = pinctrl_lookup_state(dev->pins->p, PINCTRL_STATE_SLEEP); if (IS_ERR(dev->pins->sleep_state)) /* Not supplying this state is perfectly legal */ dev_dbg(dev, "no sleep pinctrl state\n"); dev->pins->idle_state = pinctrl_lookup_state(dev->pins->p, PINCTRL_STATE_IDLE); if (IS_ERR(dev->pins->idle_state)) /* Not supplying this state is perfectly legal */ dev_dbg(dev, "no idle pinctrl state\n"); #endif return 0; /* * If no pinctrl handle or default state was found for this device, * let's explicitly free the pin container in the device, there is * no point in keeping it around. */ cleanup_get: devm_pinctrl_put(dev->pins->p); cleanup_alloc: devm_kfree(dev, dev->pins); dev->pins = NULL; /* Return deferrals */ if (ret == -EPROBE_DEFER) return ret; /* Return serious errors */ if (ret == -EINVAL) return ret; /* We ignore errors like -ENOENT meaning no pinctrl state */ return 0; }
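/*
 * Illustrative sketch (an assumption, not code from this file): once
 * pinctrl_bind_pins() has populated dev->pins, a driver's PM callbacks can
 * move between the looked-up states with the consumer helpers such as
 * pinctrl_pm_select_sleep_state() and pinctrl_pm_select_default_state().
 */
#if 0	/* example only */
static int example_suspend(struct device *dev)
{
	return pinctrl_pm_select_sleep_state(dev);	/* no-op without a sleep state */
}

static int example_resume(struct device *dev)
{
	return pinctrl_pm_select_default_state(dev);
}
#endif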
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_INTERNAL_H
#define _LINUX_HIGHMEM_INTERNAL_H

/*
 * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
 */
#ifdef CONFIG_KMAP_LOCAL
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);

static inline void kmap_assert_nomap(void)
{
	DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
static inline void kmap_local_fork(struct task_struct *tsk) { }
static inline void kmap_assert_nomap(void) { }
#endif

#ifdef CONFIG_HIGHMEM
#include <asm/highmem.h>

#ifndef ARCH_HAS_KMAP_FLUSH_TLB
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

void *kmap_high(struct page *page);
void kunmap_high(struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);

static inline void *kmap(struct page *page)
{
	void *addr;

	might_sleep();
	if (!PageHighMem(page))
		addr = page_address(page);
	else
		addr = kmap_high(page);
	kmap_flush_tlb((unsigned long)addr);
	return addr;
}

static inline void kunmap(struct page *page)
{
	might_sleep();
	if (!PageHighMem(page))
		return;
	kunmap_high(page);
}

static inline struct page *kmap_to_page(void *addr)
{
	return __kmap_to_page(addr);
}

static inline void kmap_flush_unused(void)
{
	__kmap_flush_unused();
}

static inline void *kmap_local_page(struct page *page)
{
	return __kmap_local_page_prot(page, kmap_prot);
}

static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
	struct page *page = folio_page(folio, offset / PAGE_SIZE);

	return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
}

static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
	return __kmap_local_page_prot(page, prot);
}

static inline void *kmap_local_pfn(unsigned long pfn)
{
	return __kmap_local_pfn_prot(pfn, kmap_prot);
}

static inline void __kunmap_local(const void *vaddr)
{
	kunmap_local_indexed(vaddr);
}

static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		migrate_disable();
	else
		preempt_disable();

	pagefault_disable();
	return __kmap_local_page_prot(page, prot);
}

static inline void *kmap_atomic(struct page *page)
{
	return kmap_atomic_prot(page, kmap_prot);
}
static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned int __nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned int nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. * See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. 
The compiler will warn you if you pass the page.
 */
#define kunmap_atomic(__addr)					\
do {								\
	BUILD_BUG_ON(__same_type((__addr), struct page *));	\
	__kunmap_atomic(__addr);				\
} while (0)

/**
 * kunmap_local - Unmap a page mapped via kmap_local_page().
 * @__addr: An address within the page mapped
 *
 * @__addr can be any address within the mapped page. Commonly it is the
 * address returned from kmap_local_page(), but it can also include offsets.
 *
 * Unmapping should be done in the reverse order of the mapping. See
 * kmap_local_page() for details.
 */
#define kunmap_local(__addr)					\
do {								\
	BUILD_BUG_ON(__same_type((__addr), struct page *));	\
	__kunmap_local(__addr);					\
} while (0)

#endif
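/*
 * Hypothetical usage sketch (not part of this header, assumes
 * <linux/highmem.h> and <linux/string.h>): the local-mapping API above is
 * the preferred replacement for kmap_atomic(). Mappings are CPU-local,
 * must be unmapped in reverse order, and leave page faults and preemption
 * enabled. "example_copy_to_page" is an invented name for illustration.
 */
static inline void example_copy_to_page(struct page *page, size_t offset,
					const void *src, size_t len)
{
	void *dst = kmap_local_page(page);	/* short-lived, CPU-local mapping */

	memcpy(dst + offset, src, len);
	kunmap_local(dst);			/* pass the address, not the page */
}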
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
	kref_get(&bdi->refcnt);
	return bdi;
}

struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
		    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);
void wb_wait_for_completion(struct wb_completion *done);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
	return test_bit(WB_has_dirty_io, &wb->state);
}

static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	/*
	 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
	 * any dirty wbs. See wb_update_write_bandwidth().
*/ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. 
*/ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest * * Find the wb of @bdi which matches both the memcg and blkcg of %current. * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. */ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) return &bdi->wb; wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } /** * wb_get_create_current - get or create wb for %current on a bdi * @bdi: bdi of interest * @gfp: allocation mask * * Equivalent to wb_get_create() on %current's memcg. This function is * called from a relatively hot path and optimizes the common cases using * wb_find_current(). */ static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); wb = wb_find_current(bdi); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); if (unlikely(!wb)) { struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, gfp); css_put(memcg_css); } return wb; } /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { /* * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and * can't sleep during the transaction. IRQs may or may not be disabled on * return. */ static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); /* * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. 
*/ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages * lock. inode_to_wb() will bark. Deref directly. */ return inode->i_wb; } /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode * @cookie: @cookie from unlocked_inode_to_wb_begin() */ static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { if (unlikely(cookie->locked)) xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) { return false; } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; } static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { return &bdi->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { return inode_to_wb(inode); } static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { } static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } #endif /* CONFIG_CGROUP_WRITEBACK */ const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */
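/*
 * Hypothetical usage sketch (not part of this header): accessing an inode's
 * wb without holding i_lock, the i_pages lock or wb->list_lock, using the
 * begin/end transaction helpers declared above. The helper name
 * "example_bump_dirtied" is invented; WB_DIRTIED is one of the
 * enum wb_stat_item counters from <linux/backing-dev-defs.h>.
 */
static inline void example_bump_dirtied(struct inode *inode)
{
	struct wb_lock_cookie cookie = {};
	struct bdi_writeback *wb;

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	inc_wb_stat(wb, WB_DIRTIED);		/* per-cpu batched counter */
	unlocked_inode_to_wb_end(inode, &cookie);
}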
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_AEAD_H #define _CRYPTO_INTERNAL_AEAD_H #include <crypto/aead.h> #include <crypto/algapi.h> #include <linux/stddef.h> #include <linux/types.h> struct rtattr; struct aead_instance { void (*free)(struct aead_instance *inst); union { struct { char head[offsetof(struct aead_alg, base)]; struct crypto_instance base; } s; struct aead_alg alg; }; }; struct crypto_aead_spawn { struct crypto_spawn base; }; struct aead_queue { struct crypto_queue base; }; static inline void *crypto_aead_ctx(struct crypto_aead *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_aead_ctx_dma(struct crypto_aead *tfm) { return crypto_tfm_ctx_dma(&tfm->base); } static inline struct crypto_instance *aead_crypto_instance( struct aead_instance *inst) { return container_of(&inst->alg.base, struct crypto_instance, alg); } static inline struct aead_instance *aead_instance(struct crypto_instance *inst) { return container_of(&inst->alg, struct aead_instance, alg.base); } static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead) { return aead_instance(crypto_tfm_alg_instance(&aead->base)); } static inline void *aead_instance_ctx(struct aead_instance *inst) { return crypto_instance_ctx(aead_crypto_instance(inst)); } static inline void *aead_request_ctx(struct aead_request *req) { return req->__ctx; } static inline void *aead_request_ctx_dma(struct aead_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(aead_request_ctx(req), align); } static inline void aead_request_complete(struct aead_request *req, int err) { crypto_request_complete(&req->base, err); } static inline u32 aead_request_flags(struct aead_request *req) { return req->base.flags; } static inline struct aead_request *aead_request_cast( struct crypto_async_request *req) { return container_of(req, struct aead_request, base); } int crypto_grab_aead(struct crypto_aead_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct aead_alg *crypto_spawn_aead_alg( struct crypto_aead_spawn *spawn) { return container_of(spawn->base.alg, struct aead_alg, base); } static inline struct crypto_aead *crypto_spawn_aead( struct crypto_aead_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void crypto_aead_set_reqsize(struct crypto_aead *aead, unsigned int reqsize) { aead->reqsize = reqsize; } static inline void crypto_aead_set_reqsize_dma(struct crypto_aead *aead, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); aead->reqsize = reqsize; } static inline void aead_init_queue(struct aead_queue *queue, unsigned int 
					   max_qlen)
{
	crypto_init_queue(&queue->base, max_qlen);
}

static inline unsigned int crypto_aead_alg_chunksize(struct aead_alg *alg)
{
	return alg->chunksize;
}

/**
 * crypto_aead_chunksize() - obtain chunk size
 * @tfm: cipher handle
 *
 * The block size is set to one for ciphers such as CCM. However,
 * you still need to provide incremental updates in multiples of
 * the underlying block size as the IV does not have sub-block
 * granularity. This is known in this API as the chunk size.
 *
 * Return: chunk size in bytes
 */
static inline unsigned int crypto_aead_chunksize(struct crypto_aead *tfm)
{
	return crypto_aead_alg_chunksize(crypto_aead_alg(tfm));
}

int crypto_register_aead(struct aead_alg *alg);
void crypto_unregister_aead(struct aead_alg *alg);
int crypto_register_aeads(struct aead_alg *algs, int count);
void crypto_unregister_aeads(struct aead_alg *algs, int count);

int aead_register_instance(struct crypto_template *tmpl,
			   struct aead_instance *inst);

#endif /* _CRYPTO_INTERNAL_AEAD_H */
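/*
 * Hypothetical template sketch (not part of this header), assuming the
 * usual shape of an AEAD template's ->create() callback: allocate an
 * aead_instance with a crypto_aead_spawn as its context, grab the
 * underlying algorithm by name, and drop everything on error. Filling in
 * inst->alg (ivsize, setkey, encrypt, ...) is elided; the "example_*"
 * name is invented.
 */
static int example_aead_create(struct crypto_template *tmpl, struct rtattr **tb)
{
	struct crypto_aead_spawn *spawn;
	struct aead_instance *inst;
	u32 mask;
	int err;

	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
	if (err)
		return err;

	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
	if (!inst)
		return -ENOMEM;
	spawn = aead_instance_ctx(inst);

	err = crypto_grab_aead(spawn, aead_crypto_instance(inst),
			       crypto_attr_alg_name(tb[1]), 0, mask);
	if (err)
		goto err_free;

	/* ... set up inst->alg based on crypto_spawn_aead_alg(spawn) ... */

	err = aead_register_instance(tmpl, inst);
	if (err)
		goto err_free;
	return 0;

err_free:
	crypto_drop_aead(spawn);
	kfree(inst);
	return err;
}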
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/symlink.c - kernfs symlink implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/namei.h>

#include "kernfs-internal.h"

/**
 * kernfs_create_link - create a symlink
 * @parent: directory to create the symlink in
 * @name: name of the symlink
 * @target: target node for the symlink to point to
 *
 * Return: the created node on success, ERR_PTR() value on error.
 * Ownership of the link matches ownership of the target.
 */
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
				       const char *name,
				       struct kernfs_node *target)
{
	struct kernfs_node *kn;
	int error;
	kuid_t uid = GLOBAL_ROOT_UID;
	kgid_t gid = GLOBAL_ROOT_GID;

	if (target->iattr) {
		uid = target->iattr->ia_uid;
		gid = target->iattr->ia_gid;
	}

	kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	if (kernfs_ns_enabled(parent))
		kn->ns = target->ns;
	kn->symlink.target_kn = target;
	kernfs_get(target);	/* ref owned by symlink */

	error = kernfs_add_one(kn);
	if (!error)
		return kn;

	kernfs_put(kn);
	return ERR_PTR(error);
}

static int kernfs_get_target_path(struct kernfs_node *parent,
				  struct kernfs_node *target, char *path)
{
	struct kernfs_node *base, *kn;
	char *s = path;
	int len = 0;

	/* go up to the root, stop at the base */
	base = parent;
	while (base->parent) {
		kn = target->parent;
		while (kn->parent && base != kn)
			kn = kn->parent;

		if (base == kn)
			break;

		if ((s - path) + 3 >= PATH_MAX)
			return -ENAMETOOLONG;

		strcpy(s, "../");
		s += 3;
		base = base->parent;
	}

	/* determine end of target string for reverse fillup */
	kn = target;
	while (kn->parent && kn != base) {
		len += strlen(kn->name) + 1;
		kn = kn->parent;
	}

	/* check limits */
	if (len < 2)
		return -EINVAL;
	len--;
	if ((s - path) + len >= PATH_MAX)
		return -ENAMETOOLONG;

	/* reverse fillup of target string from target to base */
	kn = target;
	while (kn->parent && kn != base) {
		int slen = strlen(kn->name);

		len -= slen;
		memcpy(s + len, kn->name, slen);
		if (len)
			s[--len] = '/';

		kn = kn->parent;
	}

	return 0;
}

static int kernfs_getlink(struct inode *inode, char *path)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_node *parent = kn->parent;
	struct kernfs_node *target = kn->symlink.target_kn;
	struct kernfs_root *root = kernfs_root(parent);
	int error;

	down_read(&root->kernfs_rwsem);
	error = kernfs_get_target_path(parent, target, path);
	up_read(&root->kernfs_rwsem);

	return error;
}

static const char *kernfs_iop_get_link(struct dentry *dentry,
				       struct inode *inode,
				       struct delayed_call *done)
{
	char *body;
	int error;

	if (!dentry)
		return ERR_PTR(-ECHILD);
	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!body)
		return ERR_PTR(-ENOMEM);
	error = kernfs_getlink(inode, body);
	if (unlikely(error < 0)) {
		kfree(body);
		return ERR_PTR(error);
	}
	set_delayed_call(done, kfree_link, body);
	return body;
}

const struct inode_operations kernfs_symlink_iops = {
	.listxattr	= kernfs_iop_listxattr,
	.get_link	= kernfs_iop_get_link,
	.setattr	= kernfs_iop_setattr,
	.getattr	= kernfs_iop_getattr,
	.permission	= kernfs_iop_permission,
};
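/*
 * Hypothetical caller sketch (not part of the file above): creating a
 * kernfs symlink with the API defined here. "parent_kn" and "target_kn"
 * are assumed pre-existing nodes, e.g. a link placed under "kernel/foo"
 * pointing at "devices/bar" beneath the same kernfs root; for that layout
 * kernfs_get_target_path() emits one "../" per level up to the common
 * ancestor and then fills the target components in reverse, so readlink()
 * would return "../../devices/bar".
 */
static int example_make_link(struct kernfs_node *parent_kn,
			     struct kernfs_node *target_kn)
{
	struct kernfs_node *link;

	link = kernfs_create_link(parent_kn, "bar", target_kn);
	if (IS_ERR(link))
		return PTR_ERR(link);
	return 0;
}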
/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 *
 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
 * Copyright (C) 2019 Andy Newell <newella@fb.com>
 * Copyright (C) 2019 Facebook
 *
 * One challenge of controlling IO resources is the lack of a trivially
 * observable cost metric. This is distinguished from CPU and memory where
 * wallclock time and the number of bytes can serve as accurate enough
 * approximations.
 *
 * Bandwidth and iops are the most commonly used metrics for IO devices but
 * depending on the type and specifics of the device, different IO patterns
 * easily lead to multiple orders of magnitude variations rendering them
 * useless for the purpose of IO capacity distribution. While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
 * non-queued rotational devices, this is no longer viable with modern
 * devices, even the rotational ones.
 *
 * While there is no cost metric we can trivially observe, it isn't a
 * complete mystery.
For example, on a rotational device, seek cost * dominates while a contiguous transfer contributes a smaller amount * proportional to the size. If we can characterize at least the relative * costs of these different types of IOs, it should be possible to * implement a reasonable work-conserving proportional IO resource * distribution. * * 1. IO Cost Model * * IO cost model estimates the cost of an IO given its basic parameters and * history (e.g. the end sector of the last IO). The cost is measured in * device time. If a given IO is estimated to cost 10ms, the device should * be able to process ~100 of those IOs in a second. * * Currently, there's only one builtin cost model - linear. Each IO is * classified as sequential or random and given a base cost accordingly. * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. * The control strategy is composed of the following three parts. * * 2-1. Vtime Distribution * * When a cgroup becomes active in terms of IOs, its hierarchical share is * calculated. Please consider the following hierarchy where the numbers * inside parentheses denote the configured weights. * * root * / \ * A (w:100) B (w:300) * / \ * A0 (w:100) A1 (w:100) * * If B is idle and only A0 and A1 are actively issuing IOs, as the two are * of equal weight, each gets 50% share. If then B starts issuing IOs, B * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) * against the device vtime - an IO which takes 10ms on the underlying * device is considered to take 80ms on A0. * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * * 2-2. Vrate Adjustment * * It's unrealistic to expect the cost model to be perfect. There are too * many devices and even on the same device the overall performance * fluctuates depending on numerous factors such as IO mixture and device * internal garbage collection. The controller needs to adapt dynamically. * * This is achieved by adjusting the overall IO rate according to how busy * the device is. If the device becomes overloaded, we're sending down too * many IOs and should generally slow down. If there are waiting issuers * but the device isn't saturated, we're issuing too few and should * generally speed up. * * To slow down, we lower the vrate - the rate at which the device vtime * passes compared to the wall clock. 
For example, if the vtime is running * at the vrate of 75%, all cgroups added up would only be able to issue * 750ms worth of IOs per second, and vice-versa for speeding up. * * Device business is determined using two criteria - rq wait and * completion latencies. * * When a device gets saturated, the on-device and then the request queues * fill up and a bio which is ready to be issued has to wait for a request * to become available. When this delay becomes noticeable, it's a clear * indication that the device is saturated and we lower the vrate. This * saturation signal is fairly conservative as it only triggers when both * hardware and software queues are filled up, and is used as the default * busy signal. * * As devices can have deep queues and be unfair in how the queued commands * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. * * The completion latency requirements are a function of both the * underlying device characteristics and the desired IO latency quality of * service. There is an inherent trade-off - the tighter the latency QoS, * the higher the bandwidth lossage. Latency QoS is disabled by default * and can be set through /sys/fs/cgroup/io.cost.qos. * * 2-3. Work Conservation * * Imagine two cgroups A and B with equal weights. A is issuing a small IO * periodically while B is sending out enough parallel IOs to saturate the * device on its own. Let's say A's usage amounts to 100ms worth of IO * cost per second, i.e., 10% of the device capacity. The naive * distribution of half and half would lead to 60% utilization of the * device, a significant reduction in the total amount of work done * compared to free-for-all competition. This is too high a cost to pay * for IO control. * * To conserve the total amount of work done, we keep track of how much * each active cgroup is actually using and yield part of its weight if * there are other cgroups which can make use of it. In the above case, * A's weight will be lowered so that it hovers above the actual usage and * B would be able to use the rest. * * As we don't want to penalize a cgroup for donating its weight, the * surplus weight adjustment factors in a margin and has an immediate * snapback mechanism in case the cgroup needs more IO vtime for itself. * * Note that adjusting down surplus weights has the same effects as * accelerating vtime for other cgroups and work conservation can also be * implemented by adjusting vrate dynamically. However, squaring who can * donate and should take back how much requires hweight propagations * anyway making it easier to implement and understand as a separate * mechanism. * * 3. Monitoring * * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see * https://github.com/osandov/drgn. The output looks like the following. 
* * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 * * - per : Timer period * - cur_per : Internal wall and device vtime clock * - vrate : Device virtual time rate against wall clock * - weight : Surplus-adjusted and configured weights * - hweight : Surplus-adjusted and configured hierarchical weights * - inflt : The percentage of in-flight IO cost at the end of last period * - del_ms : Deferred issuer delay induction level and duration * - usages : Usage history */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> #include <asm/local.h> #include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" #include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ #define TRACE_IOCG_PATH_LEN 1024 static DEFINE_SPINLOCK(trace_iocg_path_lock); static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; #define TRACE_IOCG_PATH(type, iocg, ...) \ do { \ unsigned long flags; \ if (trace_iocost_##type##_enabled()) { \ spin_lock_irqsave(&trace_iocg_path_lock, flags); \ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ trace_iocg_path, TRACE_IOCG_PATH_LEN); \ trace_iocost_##type(iocg, trace_iocg_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ } \ } while (0) #else /* CONFIG_TRACE_POINTS */ #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) #endif /* CONFIG_TRACE_POINTS */ enum { MILLION = 1000000, /* timer period is calculated from latency requirements, bound it */ MIN_PERIOD = USEC_PER_MSEC, MAX_PERIOD = USEC_PER_SEC, /* * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, }; enum { /* * As vtime is used to calculate the cost of each IO, it needs to * be fairly high precision. For example, it should be able to * represent the cost of a single page worth of discard with * suffificient accuracy. At the same time, it should be able to * represent reasonably long enough durations to be useful and * convenient during operation. * * 1s worth of vtime is 2^37. This gives us both sub-nanosecond * granularity and days of wrap-around time even at extreme vrates. */ VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ VRATE_MAX_PPM = 100000000, /* 10000% */ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, }; enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, /* * The effect of delay is indirect and non-linear and a huge amount of * future debt can accumulate abruptly while unthrottled. 
Linearly scale * up delay as debt is going up and then let it decay exponentially. * This gives us quick ramp ups while delay is accumulating and long * tails which can help reducing the frequency of debt explosions on * unthrottle. The parameters are experimentally determined. * * The delay mechanism provides adequate protection and behavior in many * cases. However, this is far from ideal and falls shorts on both * fronts. The debtors are often throttled too harshly costing a * significant level of fairness and possibly total work while the * protection against their impacts on the system can be choppy and * unreliable. * * The shortcoming primarily stems from the fact that, unlike for page * cache, the kernel doesn't have well-defined back-pressure propagation * mechanism and policies for anonymous memory. Fully addressing this * issue will likely require substantial improvements in the area. */ MIN_DELAY_THR_PCT = 500, MAX_DELAY_THR_PCT = 25000, MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, /* halve debts if avg usage over 100ms is under 50% */ DFGV_USAGE_PCT = 50, DFGV_PERIOD = 100 * USEC_PER_MSEC, /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, /* * Count IO size in 4k pages. The 12bit shift helps keeping * size-proportional components of cost calculation in closer * numbers of digits to per-IO cost components. */ IOC_PAGE_SHIFT = 12, IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, /* if apart further than 16M, consider randio for linear model */ LCOEF_RANDIO_PAGES = 4096, }; enum ioc_running { IOC_IDLE, IOC_RUNNING, IOC_STOP, }; /* io.cost.qos controls including per-dev enable of the whole controller */ enum { QOS_ENABLE, QOS_CTRL, NR_QOS_CTRL_PARAMS, }; /* io.cost.qos params */ enum { QOS_RPPM, QOS_RLAT, QOS_WPPM, QOS_WLAT, QOS_MIN, QOS_MAX, NR_QOS_PARAMS, }; /* io.cost.model controls */ enum { COST_CTRL, COST_MODEL, NR_COST_CTRL_PARAMS, }; /* builtin linear cost model coefficients */ enum { I_LCOEF_RBPS, I_LCOEF_RSEQIOPS, I_LCOEF_RRANDIOPS, I_LCOEF_WBPS, I_LCOEF_WSEQIOPS, I_LCOEF_WRANDIOPS, NR_I_LCOEFS, }; enum { LCOEF_RPAGE, LCOEF_RSEQIO, LCOEF_RRANDIO, LCOEF_WPAGE, LCOEF_WSEQIO, LCOEF_WRANDIO, NR_LCOEFS, }; enum { AUTOP_INVALID, AUTOP_HDD, AUTOP_SSD_QD1, AUTOP_SSD_DFL, AUTOP_SSD_FAST, }; struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; u64 lcoefs[NR_LCOEFS]; u32 too_fast_vrate_pct; u32 too_slow_vrate_pct; }; struct ioc_margins { s64 min; s64 low; s64 target; }; struct ioc_missed { local_t nr_met; local_t nr_missed; u32 last_met; u32 last_missed; }; struct ioc_pcpu_stat { struct ioc_missed missed[2]; local64_t rq_wait_ns; u64 last_rq_wait_ns; }; /* per device */ struct ioc { struct rq_qos rqos; bool enabled; struct ioc_params params; struct ioc_margins margins; u32 period_us; u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; spinlock_t lock; struct timer_list timer; struct list_head active_iocgs; /* active cgroups */ struct ioc_pcpu_stat __percpu *pcpu_stat; enum ioc_running running; atomic64_t vtime_rate; u64 vtime_base_rate; s64 vtime_err; seqcount_spinlock_t period_seqcount; u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ /* debt forgivness */ u64 dfgv_period_at; u64 dfgv_period_rem; u64 dfgv_usage_us_sum; u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; 
bool user_qos_params:1; bool user_cost_model:1; }; struct iocg_pcpu_stat { local64_t abs_vusage; }; struct iocg_stat { u64 usage_us; u64 wait_us; u64 indebt_us; u64 indelay_us; }; /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; struct ioc *ioc; /* * A iocg can get its weight from two sources - an explicit * per-device-cgroup configuration or the default weight of the * cgroup. `cfg_weight` is the explicit per-device-cgroup * configuration. `weight` is the effective considering both * sources. * * When an idle cgroup becomes active its `active` goes from 0 to * `weight`. `inuse` is the surplus adjusted active weight. * `active` and `inuse` are used to calculate `hweight_active` and * `hweight_inuse`. * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. * * `inuse` may be adjusted dynamically during period. `saved_*` are used * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; u32 last_inuse; s64 saved_margin; sector_t cursor; /* to detect randio */ /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; /* current delay in effect and when it started */ u64 delay; u64 delay_at; /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. */ atomic64_t active_period; struct list_head active_list; /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; /* timestamp at the latest activation */ u64 activated_at; /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; u64 wait_since; u64 indebt_since; u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; struct ioc_gq *ancestors[]; }; /* per cgroup */ struct ioc_cgrp { struct blkcg_policy_data cpd; unsigned int dfl_weight; }; struct ioc_now { u64 now_ns; u64 now; u64 vnow; }; struct iocg_wait { struct wait_queue_entry wait; struct bio *bio; u64 abs_cost; bool committed; }; struct iocg_wake_ctx { struct ioc_gq *iocg; u32 hw_inuse; s64 vbudget; }; static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { [QOS_RLAT] = 250000, /* 250ms */ [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 174019176, [I_LCOEF_RSEQIOPS] = 41708, [I_LCOEF_RRANDIOPS] = 370, [I_LCOEF_WBPS] = 178075866, [I_LCOEF_WSEQIOPS] = 42705, [I_LCOEF_WRANDIOPS] = 378, }, }, [AUTOP_SSD_QD1] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 245855193, [I_LCOEF_RSEQIOPS] = 61575, [I_LCOEF_RRANDIOPS] = 6946, [I_LCOEF_WBPS] = 141365009, [I_LCOEF_WSEQIOPS] = 33716, [I_LCOEF_WRANDIOPS] = 26796, }, }, [AUTOP_SSD_DFL] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = 
VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 488636629, [I_LCOEF_RSEQIOPS] = 8932, [I_LCOEF_RRANDIOPS] = 8518, [I_LCOEF_WBPS] = 427891549, [I_LCOEF_WSEQIOPS] = 28755, [I_LCOEF_WRANDIOPS] = 21940, }, .too_fast_vrate_pct = 500, }, [AUTOP_SSD_FAST] = { .qos = { [QOS_RLAT] = 5000, /* 5ms */ [QOS_WLAT] = 5000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 3102524156LLU, [I_LCOEF_RSEQIOPS] = 724816, [I_LCOEF_RRANDIOPS] = 778122, [I_LCOEF_WBPS] = 1742780862LLU, [I_LCOEF_WSEQIOPS] = 425702, [I_LCOEF_WRANDIOPS] = 443193, }, .too_slow_vrate_pct = 10, }, }; /* * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ static u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; static struct blkcg_policy blkcg_policy_iocost; /* accessors and helpers */ static struct ioc *rqos_to_ioc(struct rq_qos *rqos) { return container_of(rqos, struct ioc, rqos); } static struct ioc *q_to_ioc(struct request_queue *q) { return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } static const char __maybe_unused *ioc_name(struct ioc *ioc) { struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioc_gq, pd) : NULL; } static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) { return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); } static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) { return pd_to_blkg(&iocg->pd); } static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), struct ioc_cgrp, cpd); } /* * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical * weight, the more expensive each IO. Must round up. */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* * The inverse of abs_cost_to_cost(). Must round up. 
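 *
 * An illustrative round-trip (assuming WEIGHT_ONE == 1 << 16): with
 * hw_inuse at half share,
 *
 *	abs_cost_to_cost(1000, WEIGHT_ONE / 2) == 2000
 *	cost_to_abs_cost(2000, WEIGHT_ONE / 2) == 1000
 *
 * i.e. a cgroup owning half of the device pays double in its local
 * vtime for the same absolute cost, and the two conversions invert
 * each other.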
*/ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 abs_cost, u64 cost) { struct iocg_pcpu_stat *gcs; bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) { if (lock_ioc) { spin_lock_irqsave(&iocg->ioc->lock, *flags); spin_lock(&iocg->waitq.lock); } else { spin_lock_irqsave(&iocg->waitq.lock, *flags); } } static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) { if (unlock_ioc) { spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&iocg->ioc->lock, *flags); } else { spin_unlock_irqrestore(&iocg->waitq.lock, *flags); } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { u32 ppm, lat, multi, period_us; lockdep_assert_held(&ioc->lock); /* pick the higher latency target */ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { ppm = ioc->params.qos[QOS_RPPM]; lat = ioc->params.qos[QOS_RLAT]; } else { ppm = ioc->params.qos[QOS_WPPM]; lat = ioc->params.qos[QOS_WLAT]; } /* * We want the period to be long enough to contain a healthy number * of IOs while short enough for granular control. Define it as a * multiple of the latency target. Ideally, the multiplier should * be scaled according to the percentile so that it would nominally * contain a certain number of requests. Let's be simpler and * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). */ if (ppm) multi = max_t(u32, (MILLION - ppm) / 50000, 2); else multi = 2; period_us = multi * lat; period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); /* calculate dependent params */ ioc->period_us = period_us; ioc->timer_slack_ns = div64_u64( (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, 100); ioc_refresh_margins(ioc); } /* * ioc->rqos.disk isn't initialized when this function is called from * the init path. */ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; u32 vrate_pct; u64 now_ns; /* rotational? 
*/ if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ if (idx < AUTOP_SSD_DFL) return AUTOP_SSD_DFL; /* if user is overriding anything, maintain what was there */ if (ioc->user_qos_params || ioc->user_cost_model) return idx; /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) ioc->autop_too_fast_at = now_ns; if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) return idx + 1; } else { ioc->autop_too_fast_at = 0; } if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { if (!ioc->autop_too_slow_at) ioc->autop_too_slow_at = now_ns; if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) return idx - 1; } else { ioc->autop_too_slow_at = 0; } return idx; } /* * Take the followings as input * * @bps maximum sequential throughput * @seqiops maximum sequential 4k iops * @randiops maximum random 4k iops * * and calculate the linear model cost coefficients. * * *@page per-page cost 1s / (@bps / 4096) * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) */ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, u64 *page, u64 *seqio, u64 *randio) { u64 v; *page = *seqio = *randio = 0; if (bps) { u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); if (bps_pages) *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); else *page = 1; } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); if (v > *page) *seqio = v - *page; } if (randiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); if (v > *page) *randio = v - *page; } } static void ioc_refresh_lcoefs(struct ioc *ioc) { u64 *u = ioc->params.i_lcoefs; u64 *c = ioc->params.lcoefs; calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } /* * struct gendisk is required as an argument because ioc->rqos.disk * is not properly initialized when called from the init path. */ static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) return false; if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); ioc->vtime_base_rate = VTIME_PER_USEC; } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; ioc->autop_too_slow_at = 0; if (!ioc->user_qos_params) memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); if (!ioc->user_cost_model) memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); ioc_refresh_period_us(ioc); ioc_refresh_lcoefs(ioc); ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * VTIME_PER_USEC, MILLION); return true; } static bool ioc_refresh_params(struct ioc *ioc, bool force) { return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); } /* * When an iocg accumulates too much vtime or gets deactivated, we throw away * some vtime, which lowers the overall device utilization. 
As the exact amount * which is being thrown away is known, we can compensate by accelerating the * vrate accordingly so that the extra vtime generated in the current period * matches what got lost. */ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) { s64 pleft = ioc->period_at + ioc->period_us - now->now; s64 vperiod = ioc->period_us * ioc->vtime_base_rate; s64 vcomp, vcomp_min, vcomp_max; lockdep_assert_held(&ioc->lock); /* we need some time left in this period */ if (pleft <= 0) goto done; /* * Calculate how much vrate should be adjusted to offset the error. * Limit the amount of adjustment and deduct the adjusted amount from * the error. */ vcomp = -div64_s64(ioc->vtime_err, pleft); vcomp_min = -(ioc->vtime_base_rate >> 1); vcomp_max = ioc->vtime_base_rate; vcomp = clamp(vcomp, vcomp_min, vcomp_max); ioc->vtime_err += vcomp * pleft; atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); done: /* bound how much error can accumulate */ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, int nr_lagging, int nr_shortages, int prev_busy_level, u32 *missed_ppm) { u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); return; } /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level * based adjustment. */ if (vrate < vrate_min) { vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); vrate = min(vrate, vrate_min); } else if (vrate > vrate_max) { vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); vrate = max(vrate, vrate_max); } else { int idx = min_t(int, abs(ioc->busy_level), ARRAY_SIZE(vrate_adj_pct) - 1); u32 adj_pct = vrate_adj_pct[idx]; if (ioc->busy_level > 0) adj_pct = 100 - adj_pct; else adj_pct = 100 + adj_pct; vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), vrate_min, vrate_max); } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; u64 vrate; now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is * * vtime at period start + (wallclock time since the start) * vrate * * As a consistent snapshot of `period_at_vtime` and `period_at` is * needed, they're seqcount protected. */ do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); ioc->period_at = now->now; ioc->period_at_vtime = now->vnow; write_seqcount_end(&ioc->period_seqcount); ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); add_timer(&ioc->timer); } /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. If @save, the current margin * is saved to be used as reference for later inuse in-period adjustments. 
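 *
 * Illustrative numbers: if, after applying the update, a parent with
 * weight 100 is left with child_active_sum == 200 and child_inuse_sum
 * == 50, its own pair becomes active == 100 and inuse ==
 * DIV64_U64_ROUND_UP(100 * 50, 200) == 25, i.e. it only claims a
 * quarter of its weight as in-use at the next level up.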
*/ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); /* * For an active leaf node, its inuse shouldn't be zero or exceed * @active. An active internal node's inuse is solely determined by the * inuse to active ratio of its children regardless of @inuse. */ if (list_empty(&iocg->active_list) && iocg->child_active_sum) { inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { inuse = clamp_t(u32, inuse, 1, active); } iocg->last_inuse = iocg->inuse; if (save) iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); if (active == iocg->active && inuse == iocg->inuse) return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u32 parent_active = 0, parent_inuse = 0; /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); /* apply the updates */ child->active = active; child->inuse = inuse; /* * The delta between inuse and active sums indicates that * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { parent_active = parent->weight; parent_inuse = DIV64_U64_ROUND_UP( parent_active * parent->child_inuse_sum, parent->child_active_sum); } /* do we need to keep walking up? */ if (parent_active == parent->active && parent_inuse == parent->inuse) break; active = parent_active; inuse = parent_inuse; } ioc->weights_updated = true; } static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); if (ioc->weights_updated) { /* paired with rmb in current_hweight(), see there */ smp_wmb(); atomic_inc(&ioc->hweight_gen); ioc->weights_updated = false; } } static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) { struct ioc *ioc = iocg->ioc; int lvl; u32 hwa, hwi; int ioc_gen; /* hot path - if uptodate, use cached */ ioc_gen = atomic_read(&ioc->hweight_gen); if (ioc_gen == iocg->hweight_gen) goto out; /* * Paired with wmb in commit_weights(). If we saw the updated * hweight_gen, all the weight updates from __propagate_weights() are * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future * reader will recalculate and we're guaranteed to discard the * wrong result soon. 
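 *
 * The walk below simply multiplies the active and inuse ratios along
 * the ancestry. Illustrative numbers: for a level-2 iocg whose parent
 * accounts for 100 out of the root's child_active_sum of 400 while the
 * iocg itself accounts for all of its parent's sum, hweight_active
 * comes out as WEIGHT_ONE * 100 / 400 == WEIGHT_ONE / 4.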
*/ smp_rmb(); hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u64 active_sum = READ_ONCE(parent->child_active_sum); u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); /* we can race with deactivations and either may read as zero */ if (!active_sum || !inuse_sum) continue; active_sum = max_t(u64, active, active_sum); hwa = div64_u64((u64)hwa * active, active_sum); inuse_sum = max_t(u64, inuse, inuse_sum); hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); iocg->hweight_inuse = max_t(u32, hwi, 1); iocg->hweight_gen = ioc_gen; out: if (hw_activep) *hw_activep = iocg->hweight_active; if (hw_inusep) *hw_inusep = iocg->hweight_inuse; } /* * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the * other weights stay unchanged. */ static u32 current_hweight_max(struct ioc_gq *iocg) { u32 hwm = WEIGHT_ONE; u32 inuse = iocg->active; u64 child_inuse_sum; int lvl; lockdep_assert_held(&iocg->ioc->lock); for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, parent->child_active_sum); } return max_t(u32, hwm, 1); } static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); u32 weight; lockdep_assert_held(&ioc->lock); weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; /* * If seem to be already active, just update the stamp to tell the * timer that we're still active. We don't mind occassional races. */ if (!list_empty(&iocg->active_list)) { ioc_now(ioc, now); cur_period = atomic64_read(&ioc->cur_period); if (atomic64_read(&iocg->active_period) != cur_period) atomic64_set(&iocg->active_period, cur_period); return true; } /* racy check on internal node IOs, treat as root level IOs */ if (iocg->child_active_sum) return false; spin_lock_irq(&ioc->lock); ioc_now(ioc, now); /* update period */ cur_period = atomic64_read(&ioc->cur_period); last_period = atomic64_read(&iocg->active_period); atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ if (!list_empty(&iocg->active_list)) goto succeed_unlock; for (i = iocg->level - 1; i > 0; i--) if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; if (iocg->child_active_sum) goto fail_unlock; /* * Always start with the target budget. On deactivation, we throw away * anything above it. */ vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); atomic64_add(vtarget - vtime, &iocg->vtime); atomic64_add(vtarget - vtime, &iocg->done_vtime); vtime = vtarget; /* * Activate, propagate weight and start period timer if not * running. Reset hweight_gen to avoid accidental match from * wrapping. 
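 *
 * Note that subtracting one from the current generation below
 * guarantees the cached hweights are treated as stale and recomputed
 * on the next current_hweight() call.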
*/ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); propagate_weights(iocg, iocg->weight, iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } succeed_unlock: spin_unlock_irq(&ioc->lock); return true; fail_unlock: spin_unlock_irq(&ioc->lock); return false; } static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* * If the delay is set by another CPU, we may be in the past. No need to * change anything if so. This avoids decay calculation underflow. */ if (time_before64(now->now, iocg->delay_at)) return false; /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; shift = div64_u64(tdelta, USEC_PER_SEC); if (iocg->delay && shift < BITS_PER_LONG) delay = iocg->delay >> shift; else delay = 0; /* calculate the new delay from the debt amount */ current_hweight(iocg, &hwa, NULL); vover = atomic64_read(&iocg->vtime) + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; vover_pct = div64_s64(100 * vover, ioc->period_us * ioc->vtime_base_rate); if (vover_pct <= MIN_DELAY_THR_PCT) new_delay = 0; else if (vover_pct >= MAX_DELAY_THR_PCT) new_delay = MAX_DELAY; else new_delay = MIN_DELAY + div_u64((MAX_DELAY - MIN_DELAY) * (vover_pct - MIN_DELAY_THR_PCT), MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); /* pick the higher one and apply */ if (new_delay > delay) { iocg->delay = new_delay; iocg->delay_at = now->now; delay = new_delay; } if (delay >= MIN_DELAY) { if (!iocg->indelay_since) iocg->indelay_since = now->now; blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; } else { if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; blkcg_clear_delay(blkg); return false; } } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, struct ioc_now *now) { struct iocg_pcpu_stat *gcs; lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); WARN_ON_ONCE(list_empty(&iocg->active_list)); /* * Once in debt, debt handling owns inuse. @iocg stays at the minimum * inuse donating all of it share to others until its debt is paid off. */ if (!iocg->abs_vdebt && abs_cost) { iocg->indebt_since = now->now; propagate_weights(iocg, iocg->active, 0, false, now); } iocg->abs_vdebt += abs_cost; gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, struct ioc_now *now) { lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); /* * make sure that nobody messed with @iocg. Check iocg->pd.online * to avoid warn when removing blkcg or disk. 
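 *
 * While in debt, debt handling owns the weight and inuse is pinned to
 * the minimum by iocg_incur_debt(), which is what the inuse check
 * below relies on.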
*/ WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, false, now); } } static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; if (ctx->vbudget < 0) return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). Note that the * order of operations is important as finish_wait() tests whether * @wq_entry is removed without grabbing the lock. */ default_wake_function(wq_entry, mode, flags, key); list_del_init_careful(&wq_entry->entry); return 0; } /* * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in * addition to iocg->waitq.lock. */ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); atomic64_add(vpay, &iocg->vtime); atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; } if (iocg->abs_vdebt || iocg->delay) iocg_kick_delay(iocg, now); /* * Debt can still be outstanding if we haven't paid all yet or the * caller raced and called without @pay_debt. Shouldn't wake up waiters * under debt. Make sure @vbudget reflects the outstanding amount and is * not positive. */ if (iocg->abs_vdebt) { s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* * Wake up the ones which are due and see how much vtime we'll need for * the next one. As paying off debt restores hw_inuse, it must be read * after the above debt payment. 
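 *
 * iocg_wake_fn() commits waiters in queue order and returns -1 once
 * the remaining budget goes negative, which stops
 * __wake_up_locked_key() from walking further down the waitqueue.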
*/ ctx.vbudget = vbudget; current_hweight(iocg, NULL, &ctx.hw_inuse); __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; } if (!iocg->wait_since) iocg->wait_since = now->now; if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); iocg_lock(iocg, pay_debt, &flags); iocg_kick_waitq(iocg, pay_debt, &now); iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; u64 rq_wait_ns = 0; int cpu, rw; for_each_online_cpu(cpu) { struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { u32 this_met = local_read(&stat->missed[rw].nr_met); u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; stat->missed[rw].last_met = this_met; stat->missed[rw].last_missed = this_missed; } this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } for (rw = READ; rw <= WRITE; rw++) { if (nr_met[rw] + nr_missed[rw]) missed_ppm_ar[rw] = DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, nr_met[rw] + nr_missed[rw]); else missed_ppm_ar[rw] = 0; } *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); } /* was iocg idle this period? */ static bool iocg_is_idle(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; /* did something get issued this period? */ if (atomic64_read(&iocg->active_period) == atomic64_read(&ioc->cur_period)) return false; /* is something in flight? */ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) return false; return true; } /* * Call this function on the target leaf @iocg's to build pre-order traversal * list of all the ancestors in @inner_walk. The inner nodes are linked through * ->walk_list and the caller is responsible for dissolving the list after use. 
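 *
 * Illustrative example: calling this for leaves under R/A and then
 * R/B yields @inner_walk == { R, A, B }, i.e. each inner node is
 * added exactly once with parents ahead of their children.
 */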
*/ static void iocg_build_inner_walk(struct ioc_gq *iocg, struct list_head *inner_walk) { int lvl; WARN_ON_ONCE(!list_empty(&iocg->walk_list)); /* find the first ancestor which hasn't been visited yet */ for (lvl = iocg->level - 1; lvl >= 0; lvl--) { if (!list_empty(&iocg->ancestors[lvl]->walk_list)) break; } /* walk down and visit the inner nodes to get pre-order traversal */ while (++lvl <= iocg->level - 1) { struct ioc_gq *inner = iocg->ancestors[lvl]; /* record traversal order */ list_add_tail(&inner->walk_list, inner_walk); } } /* propagate the deltas to the parent */ static void iocg_flush_stat_upward(struct ioc_gq *iocg) { if (iocg->level > 0) { struct iocg_stat *parent_stat = &iocg->ancestors[iocg->level - 1]->stat; parent_stat->usage_us += iocg->stat.usage_us - iocg->last_stat.usage_us; parent_stat->wait_us += iocg->stat.wait_us - iocg->last_stat.wait_us; parent_stat->indebt_us += iocg->stat.indebt_us - iocg->last_stat.indebt_us; parent_stat->indelay_us += iocg->stat.indelay_us - iocg->last_stat.indelay_us; } iocg->last_stat = iocg->stat; } /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 abs_vusage = 0; u64 vusage_delta; int cpu; lockdep_assert_held(&iocg->ioc->lock); /* collect per-cpu counters */ for_each_possible_cpu(cpu) { abs_vusage += local64_read( per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); } vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->stat.usage_us += iocg->usage_delta_us; iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) { LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg; /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } /* * Determine what @iocg's hweight_inuse should be after donating unused * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return 1; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; /* throw away excess above target */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* * Let's say the distance between iocg's and device's vtimes as a * fraction of period duration is delta. Assuming that the iocg will * consume the usage determined above, we want to determine new_hwi so * that delta equals MARGIN_TARGET at the end of the next period. 
* * We need to execute usage worth of IOs while spending the sum of the * new budget (1 - MARGIN_TARGET) and the leftover from the last period * (delta): * * usage = (1 - MARGIN_TARGET + delta) * new_hwi * * Therefore, the new_hwi is: * * new_hwi = usage / (1 - MARGIN_TARGET + delta) */ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), now->vnow - ioc->period_at_vtime); target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); return clamp_t(s64, new_hwi, 1, hwm); } /* * For work-conservation, an iocg which isn't using all of its share should * donate the leftover to other iocgs. There are two ways to achieve this - 1. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. * * #1 is mathematically simpler but has the drawback of requiring synchronous * global hweight_inuse updates when idle iocg's get activated or inuse weights * change due to donation snapbacks as it has the possibility of grossly * overshooting what's allowed by the model and vrate. * * #2 is inherently safe with local operations. The donating iocg can easily * snap back to higher weights when needed without worrying about impacts on * other nodes as the impacts will be inherently correct. This also makes idle * iocg activations safe. The only effect activations have is decreasing * hweight_inuse of others, the right solution to which is for those iocgs to * snap back to higher weights. * * So, we go with #2. The challenge is calculating how each donating iocg's * inuse should be adjusted to achieve the target donation amounts. This is done * using Andy's method described in the following pdf. * * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo * * Given the weights and target after-donation hweight_inuse values, Andy's * method determines how the proportional distribution should look like at each * sibling level to maintain the relative relationship between all non-donating * pairs. To roughly summarize, it divides the tree into donating and * non-donating parts, calculates global donation rate which is used to * determine the target hweight_inuse for each node, and then derives per-level * proportions. * * The following pdf shows that global distribution calculated this way can be * achieved by scaling inuse weights of donating leaves and propagating the * adjustments upwards proportionally. * * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE * * Combining the above two, we can determine how each leaf iocg's inuse should * be adjusted to achieve the target donation. * * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN * * The inline comments use symbols from the last pdf. * * b is the sum of the absolute budgets in the subtree. 1 for the root node. * f is the sum of the absolute budgets of non-donating nodes in the subtree. * t is the sum of the absolute budgets of donating nodes in the subtree. * w is the weight of the node. w = w_f + w_t * w_f is the non-donating portion of w. w_f = w * f / b * w_b is the donating portion of w. w_t = w * t / b * s is the sum of all sibling weights. s = Sum(w) for siblings * s_f and s_t are the non-donating and donating portions of s. * * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. * w_pt is the donating portion of the parent's weight and w'_pt the same value * after adjustments. Subscript r denotes the root node's values. 
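 *
 * Illustrative numbers: if the donating leaves currently own 40% of
 * the device but should only own 10% after donation, the non-donating
 * share grows from 60% to 90%, i.e. the non-donating budgets get
 * scaled by a global donation rate of 1.5. The per-level adjustments
 * below distribute that scaling while preserving the relative
 * relationship between non-donating pairs.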
*/ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { LIST_HEAD(over_hwa); LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg, *root_iocg; u32 after_sum, over_sum, over_target, gamma; /* * It's pretty unlikely but possible for the total sum of * hweight_after_donation's to be higher than WEIGHT_ONE, which will * confuse the following calculations. If such condition is detected, * scale down everyone over its full share equally to keep the sum below * WEIGHT_ONE. */ after_sum = 0; over_sum = 0; list_for_each_entry(iocg, surpluses, surplus_list) { u32 hwa; current_hweight(iocg, &hwa, NULL); after_sum += iocg->hweight_after_donation; if (iocg->hweight_after_donation > hwa) { over_sum += iocg->hweight_after_donation; list_add(&iocg->walk_list, &over_hwa); } } if (after_sum >= WEIGHT_ONE) { /* * The delta should be deducted from the over_sum, calculate * target over_sum value. */ u32 over_delta = after_sum - (WEIGHT_ONE - 1); WARN_ON_ONCE(over_sum <= over_delta); over_target = over_sum - over_delta; } else { over_target = 0; } list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { if (over_target) iocg->hweight_after_donation = div_u64((u64)iocg->hweight_after_donation * over_target, over_sum); list_del_init(&iocg->walk_list); } /* * Build pre-order inner node walk list and prepare for donation * adjustment calculations. */ list_for_each_entry(iocg, surpluses, surplus_list) { iocg_build_inner_walk(iocg, &inner_walk); } root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); WARN_ON_ONCE(root_iocg->level > 0); list_for_each_entry(iocg, &inner_walk, walk_list) { iocg->child_adjusted_sum = 0; iocg->hweight_donating = 0; iocg->hweight_after_donation = 0; } /* * Propagate the donating budget (b_t) and after donation budget (b'_t) * up the hierarchy. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { if (iocg->level > 0) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } } /* * Calculate inner hwa's (b) and make sure the donation values are * within the accepted ranges as we're doing low res calculations with * roundups. */ list_for_each_entry(iocg, &inner_walk, walk_list) { if (iocg->level) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; iocg->hweight_active = DIV64_U64_ROUND_UP( (u64)parent->hweight_active * iocg->active, parent->child_active_sum); } iocg->hweight_donating = min(iocg->hweight_donating, iocg->hweight_active); iocg->hweight_after_donation = min(iocg->hweight_after_donation, iocg->hweight_donating - 1); if (WARN_ON_ONCE(iocg->hweight_active <= 1 || iocg->hweight_donating <= 1 || iocg->hweight_after_donation == 0)) { pr_warn("iocg: invalid donation weights in "); pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); pr_cont(": active=%u donating=%u after=%u\n", iocg->hweight_active, iocg->hweight_donating, iocg->hweight_after_donation); } } /* * Calculate the global donation rate (gamma) - the rate to adjust * non-donating budgets by. * * No need to use 64bit multiplication here as the first operand is * guaranteed to be smaller than WEIGHT_ONE (1<<16). 
* * We know that there are beneficiary nodes and the sum of the donating * hweights can't be whole; however, due to the round-ups during hweight * calculations, root_iocg->hweight_donating might still end up equal to * or greater than whole. Limit the range when calculating the divider. * * gamma = (1 - t_r') / (1 - t_r) */ gamma = DIV_ROUND_UP( (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); /* * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner * nodes. */ list_for_each_entry(iocg, &inner_walk, walk_list) { struct ioc_gq *parent; u32 inuse, wpt, wptp; u64 st, sf; if (iocg->level == 0) { /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), WEIGHT_ONE - iocg->hweight_after_donation); continue; } parent = iocg->ancestors[iocg->level - 1]; /* b' = gamma * b_f + b_t' */ iocg->hweight_inuse = DIV64_U64_ROUND_UP( (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), WEIGHT_ONE) + iocg->hweight_after_donation; /* w' = s' * b' / b'_p */ inuse = DIV64_U64_ROUND_UP( (u64)parent->child_adjusted_sum * iocg->hweight_inuse, parent->hweight_inuse); /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ st = DIV64_U64_ROUND_UP( iocg->child_active_sum * iocg->hweight_donating, iocg->hweight_active); sf = iocg->child_active_sum - st; wpt = DIV64_U64_ROUND_UP( (u64)iocg->active * iocg->hweight_donating, iocg->hweight_active); wptp = DIV64_U64_ROUND_UP( (u64)inuse * iocg->hweight_after_donation, iocg->hweight_inuse); iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); } /* * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and * we can finally determine leaf adjustments. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; /* * In-debt iocgs participated in the donation calculation with * the minimum target hweight_inuse. Configuring inuse * accordingly would work fine but debt handling expects * @iocg->inuse stay at the minimum and we don't wanna * interfere. */ if (iocg->abs_vdebt) { WARN_ON_ONCE(iocg->inuse > 1); continue; } /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); TRACE_IOCG_PATH(inuse_transfer, iocg, now, iocg->inuse, inuse, iocg->hweight_inuse, iocg->hweight_after_donation); __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) list_del_init(&iocg->walk_list); } /* * A low weight iocg can amass a large amount of debt, for example, when * anonymous memory gets reclaimed aggressively. If the system has a lot of * memory paired with a slow IO device, the debt can span multiple seconds or * more. If there are no other subsequent IO issuers, the in-debt iocg may end * up blocked paying its debt while the IO device is idle. * * The following protects against such cases. If the device has been * sufficiently idle for a while, the debts are halved and delays are * recalculated. 
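 *
 * For example (illustrative): if roughly three DFGV_PERIODs of
 * sufficiently idle time have accumulated since the last reduction,
 * each debtor's abs_vdebt and delay are shifted right by three bits
 * (halved three times), clamped to a minimum of 1 so the debt and
 * delay are reduced but never fully forgiven in one step.
 */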
*/ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; u64 dur, usage_pct, nr_cycles; /* if no debtor, reset the cycle */ if (!nr_debtors) { ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc->dfgv_usage_us_sum = 0; return; } /* * Debtors can pass through a lot of writes choking the device and we * don't want to be forgiving debts while the device is struggling from * write bursts. If we're missing latency targets, consider the device * fully utilized. */ if (ioc->busy_level > 0) usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); ioc->dfgv_usage_us_sum += usage_us_sum; if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) return; /* * At least DFGV_PERIOD has passed since the last period. Calculate the * average usage and reset the period counters. */ dur = now->now - ioc->dfgv_period_at; usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); ioc->dfgv_period_at = now->now; ioc->dfgv_usage_us_sum = 0; /* if was too busy, reset everything */ if (usage_pct > DFGV_USAGE_PCT) { ioc->dfgv_period_rem = 0; return; } /* * Usage is lower than threshold. Let's forgive some debts. Debt * forgiveness runs off of the usual ioc timer but its period usually * doesn't match ioc's. Compensate the difference by performing the * reduction as many times as would fit in the duration since the last * run and carrying over the left-over duration in @ioc->dfgv_period_rem * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive * reductions is doubled. */ nr_cycles = dur + ioc->dfgv_period_rem; ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 __maybe_unused old_debt, __maybe_unused old_delay; if (!iocg->abs_vdebt && !iocg->delay) continue; spin_lock(&iocg->waitq.lock); old_debt = iocg->abs_vdebt; old_delay = iocg->delay; if (iocg->abs_vdebt) iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; if (iocg->delay) iocg->delay = iocg->delay >> nr_cycles ?: 1; iocg_kick_waitq(iocg, true, now); TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, old_debt, iocg->abs_vdebt, old_delay, iocg->delay); spin_unlock(&iocg->waitq.lock); } } /* * Check the active iocgs' state to avoid oversleeping and deactive * idle iocgs. * * Since waiters determine the sleep durations based on the vrate * they saw at the time of sleep, if vrate has increased, some * waiters could be sleeping for too long. Wake up tardy waiters * which should have woken up in the last period and expire idle * iocgs. 
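 *
 * Returns the number of iocgs still carrying debt or delay so that
 * the caller can decide whether debt forgiveness needs to run.
 */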
*/ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); /* flush wait and indebt stat deltas */ if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ u64 vtime = atomic64_read(&iocg->vtime); s64 excess; /* * @iocg has been inactive for a full duration and will * have a high budget. Account anything above target as * error and throw away. On reactivation, it'll start * with the target budget. */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; current_hweight(iocg, NULL, &old_hwi); ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } TRACE_IOCG_PATH(iocg_idle, iocg, now, atomic64_read(&iocg->active_period), atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } commit_weights(ioc); return nr_debtors; } static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; if (WARN_ON_ONCE(!period_vtime)) { spin_unlock_irq(&ioc->lock); return; } nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation * below needs updated usage stat. Let's bring stat up-to-date. */ iocg_flush_stat(&ioc->active_iocgs, &now); /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent * iocgs from accumulating a large amount of budget. */ vdone = atomic64_read(&iocg->done_vtime); vtime = atomic64_read(&iocg->vtime); current_hweight(iocg, &hw_active, &hw_inuse); /* * Latency QoS detection doesn't account for IOs which are * in-flight for longer than a period. Detect them by * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. 
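 *
 * Note that an iocg counts as lagging only while its vtime is within
 * MAX_LAGGING_PERIODS periods of vnow, so a single pathologically slow
 * command can't pin the lagging state indefinitely.
 */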
*/ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; /* * Determine absolute usage factoring in in-flight IOs to avoid * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, old_hwi, hwm, new_hwi, usage; u64 usage_dur; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } /* convert to hweight based usage ratio */ if (time_after64(iocg->activated_at, ioc->period_at)) usage_dur = max_t(u64, now.now - iocg->activated_at, 1); else usage_dur = max_t(u64, now.now - ioc->period_at, 1); usage = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. * Determine the donation amount. */ current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, old_hwi, hwm, usage, &now); /* * Donation calculation assumes hweight_after_donation * to be positive, a condition that a donor w/ hwa < 2 * can't meet. Don't bother with donation if hwa is * below 2. It's not gonna make a meaningful difference * anyway. */ if (new_hwi < hwm && hwa >= 2) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else if (!iocg->abs_vdebt) { /* * @iocg doesn't have enough to donate. Reset * its inuse to active. * * Don't reset debtors as their inuse's are * owned by debt handling. This shouldn't affect * donation calculuation in any meaningful way * as @iocg doesn't have a meaningful amount of * share anyway. */ TRACE_IOCG_PATH(inuse_shortage, iocg, &now, iocg->inuse, iocg->active, iocg->hweight_inuse, new_hwi); __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; } } else { /* genuinely short on vtime */ nr_shortages++; } } if (!list_empty(&surpluses) && nr_shortages) transfer_surpluses(&surpluses, &now); commit_weights(ioc); /* surplus list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { /* QoS targets are being met with >25% margin */ if (nr_shortages) { /* * We're throttling while the device has spare * capacity. If vrate was being slowed down, stop. 
*/ ioc->busy_level = min(ioc->busy_level, 0); /* * If there are IOs spanning multiple periods, wait * them out before pushing the device harder. */ if (!nr_lagging) ioc->busy_level--; } else { /* * Nobody is being throttled and the users aren't * issuing enough IOs to saturate the device. We * simply don't know how close the device is to * saturation. Coast. */ ioc->busy_level = 0; } } else { /* inside the hysterisis margin, we're good */ ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. */ atomic64_inc(&ioc->cur_period); if (ioc->running != IOC_STOP) { if (!list_empty(&ioc->active_iocgs)) { ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; ioc->vtime_err = 0; ioc->running = IOC_IDLE; } ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); } static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u64 abs_cost, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return cost; /* * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || iocg->inuse == iocg->active) return cost; spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irqrestore(&ioc->lock, flags); return cost; } /* * Bump up inuse till @abs_cost fits in the existing budget. * adj_step must be determined after acquiring ioc->lock - we might * have raced and lost to another thread for activation and could * be reading 0 iocg->active before ioc->lock which will lead to * infinite loop. 
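 *
 * Each iteration of the loop below grows inuse by one adj_step,
 * recomputes hweight_inuse and the resulting cost, and stops as soon
 * as @abs_cost fits within the current budget or inuse reaches
 * active.
 */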
*/ new_inuse = iocg->inuse; adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); do { new_inuse = new_inuse + adj_step; propagate_weights(iocg, iocg->active, new_inuse, true, now); current_hweight(iocg, NULL, &hwi); cost = abs_cost_to_cost(abs_cost, hwi); } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); return cost; } static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { struct ioc *ioc = iocg->ioc; u64 coef_seqio, coef_randio, coef_page; u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); u64 seek_pages = 0; u64 cost = 0; /* Can't calculate cost for empty bio */ if (!bio->bi_iter.bi_size) goto out; switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; break; default: goto out; } if (iocg->cursor) { seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; } if (!is_merge) { if (seek_pages > LCOEF_RANDIO_PAGES) { cost += coef_randio; } else { cost += coef_seqio; } } cost += pages * coef_page; out: *costp = cost; } static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) { u64 cost; calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); return cost; } static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, u64 *costp) { unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; switch (req_op(rq)) { case REQ_OP_READ: *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; break; default: *costp = 0; } } static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) { u64 cost; calc_size_vtime_cost_builtin(rq, ioc, &cost); return cost; } static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; /* bypass IOs if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; if (!iocg_activate(iocg, &now)) return; iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. This can be handled in two ways. IOs which may * cause priority inversions are punted to @ioc->aux_iocg and charged as * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine * whether debt handling is needed and acquire locks accordingly. 
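 *
 * Note that the waitq path below puts the issuer to sleep in
 * TASK_UNINTERRUPTIBLE state, so bios which must not block that way -
 * those issued as the root blkg or with a fatal signal pending - take
 * the debt path instead.
 */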
*/ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* * @iocg must stay activated for debt and waitq handling. Deactivation * is synchronized against both ioc->lock and waitq.lock and we won't * get deactivated as long as we're waiting or has debt, so we're good * if we're activated here. In the unlikely cases that we aren't, just * issue the IO. */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. If @bio has to be issued regardless, remember * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay * off the debt before waking more IOs. * * This way, the debt is continuously paid off each period with the * actual budget available to the cgroup. If we just wound vtime, we * would incorrectly use the current hw_inuse for the entire amount * which, for example, can lead to the cgroup staying blocked for a * long time even with substantially raised hw_inuse. * * An iocg with vdebt should stay online so that the timer can keep * deducting its vdebt and [de]activate use_delay mechanism * accordingly. We don't want to race against the timer trying to * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; } /* guarantee that iocgs w/ waiters have maximum inuse */ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; goto retry_lock; } propagate_weights(iocg, iocg->active, iocg->active, true, &now); } /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the * current vrate. vtime and hweight changes can make it too short * or too long. Each wait entry records the absolute cost it's * waiting for to allow re-evaluation using a custom wait entry. * * If too short, the timer simply reschedules itself. If too long, * the period timer will notice and trigger wakeups. * * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. 
*/ init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); wait.wait.private = current; wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); iocg_kick_waitq(iocg, ioc_locked, &now); iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } /* waker already committed us, proceed */ finish_wait(&iocg->waitq, &wait.wait); } static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); if (!abs_cost) return; ioc_now(ioc, &now); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) iocg->cursor = bio_end; /* * Charge if there's enough vtime budget and the existing request has * cost assigned. */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * Otherwise, account it as debt if @iocg is online, which it should * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ spin_lock_irqsave(&ioc->lock, flags); spin_lock(&iocg->waitq.lock); if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); if (iocg && bio->bi_iocost_cost) atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); } static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; break; case REQ_OP_WRITE: pidx = QOS_WLAT; rw = WRITE; break; default: return; } on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); ccs = get_cpu_ptr(ioc->pcpu_stat); if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) local_inc(&ccs->missed[rw].nr_met); else local_inc(&ccs->missed[rw].nr_missed); local64_add(rq_wait_ns, &ccs->rq_wait_ns); put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); spin_lock_irq(&ioc->lock); ioc_refresh_params(ioc, false); spin_unlock_irq(&ioc->lock); } static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; spin_unlock_irq(&ioc->lock); 
timer_shutdown_sync(&ioc->timer); free_percpu(ioc->pcpu_stat); kfree(ioc); } static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, .done = ioc_rqos_done, .queue_depth_changed = ioc_rqos_queue_depth_changed, .exit = ioc_rqos_exit, }; static int blk_iocost_init(struct gendisk *disk) { struct ioc *ioc; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) return -ENOMEM; ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); if (!ioc->pcpu_stat) { kfree(ioc); return -ENOMEM; } for_each_possible_cpu(cpu) { struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { local_set(&ccs->missed[i].nr_met, 0); local_set(&ccs->missed[i].nr_missed, 0); } local64_set(&ccs->rq_wait_ns, 0); } spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); /* * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) { struct ioc_cgrp *iocc; iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); if (!iocc) return NULL; iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } static void ioc_cpd_free(struct blkcg_policy_data *cpd) { kfree(container_of(cpd, struct ioc_cgrp, cpd)); } static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, disk->node_id); if (!iocg) return NULL; iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); if (!iocg->pcpu_stat) { kfree(iocg); return NULL; } return &iocg->pd; } static void ioc_pd_init(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); struct ioc *ioc = q_to_ioc(blkg->q); struct ioc_now now; struct blkcg_gq *tblkg; unsigned long flags; ioc_now(ioc, &now); iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { struct ioc_gq *tiocg = blkg_to_iocg(tblkg); 
iocg->ancestors[tiocg->level] = tiocg; } spin_lock_irqsave(&ioc->lock, flags); weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_pd_free(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; unsigned long flags; if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { struct ioc_now now; ioc_now(ioc, &now); propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); } static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } static int ioc_weight_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; if (!strchr(buf, ':')) { struct blkcg_gq *blkg; if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) return -EINVAL; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); } } spin_unlock_irq(&blkcg->lock); return nbytes; } blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) goto err; iocg = blkg_to_iocg(ctx.blkg); if (!strncmp(ctx.body, "default", 7)) { v = 0; } else { if (!sscanf(ctx.body, "%u", &v)) goto einval; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) goto einval; } spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_exit(&ctx); return nbytes; einval: ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; if (!dname) return 0; spin_lock_irq(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u 
rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, ioc->params.qos[QOS_RPPM] % 10000 / 100, ioc->params.qos[QOS_RLAT], ioc->params.qos[QOS_WPPM] / 10000, ioc->params.qos[QOS_WPPM] % 10000 / 100, ioc->params.qos[QOS_WLAT], ioc->params.qos[QOS_MIN] / 10000, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); spin_unlock_irq(&ioc->lock); return 0; } static int ioc_qos_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t qos_ctrl_tokens = { { QOS_ENABLE, "enable=%u" }, { QOS_CTRL, "ctrl=%s" }, { NR_QOS_CTRL_PARAMS, NULL }, }; static const match_table_t qos_tokens = { { QOS_RPPM, "rpct=%s" }, { QOS_RLAT, "rlat=%u" }, { QOS_WPPM, "wpct=%s" }, { QOS_WLAT, "wlat=%u" }, { QOS_MIN, "min=%s" }, { QOS_MAX, "max=%s" }, { NR_QOS_PARAMS, NULL }, }; static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; disk = ctx.bdev->bd_disk; if (!queue_is_mq(disk->queue)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); if (ret) goto err; ioc = q_to_ioc(disk->queue); } blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; s64 v; if (!*p) continue; switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: if (match_u64(&args[0], &v)) goto einval; enable = v; continue; case QOS_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; } tok = match_token(p, qos_tokens, args); switch (tok) { case QOS_RPPM: case QOS_WPPM: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0 || v > 10000) goto einval; qos[tok] = v * 100; break; case QOS_RLAT: case QOS_WLAT: if (match_u64(&args[0], &v)) goto einval; qos[tok] = v; break; case QOS_MIN: case QOS_MAX: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0) goto einval; qos[tok] = clamp_t(s64, v * 100, VRATE_MIN_PPM, VRATE_MAX_PPM); break; default: goto einval; } user = true; } if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; if (enable && !ioc->enabled) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else if (!enable && ioc->enabled) { blk_stat_disable_accounting(disk->queue); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } if (user) { memcpy(ioc->params.qos, qos, sizeof(qos)); ioc->user_qos_params = true; } else { ioc->user_qos_params = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); if (enable) wbt_disable_default(disk); else wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); 
blk_mq_unfreeze_queue(disk->queue); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_cost_model_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; u64 *u = ioc->params.i_lcoefs; if (!dname) return 0; spin_lock_irq(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); spin_unlock_irq(&ioc->lock); return 0; } static int ioc_cost_model_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t cost_ctrl_tokens = { { COST_CTRL, "ctrl=%s" }, { COST_MODEL, "model=%s" }, { NR_COST_CTRL_PARAMS, NULL }, }; static const match_table_t i_lcoef_tokens = { { I_LCOEF_RBPS, "rbps=%u" }, { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, { I_LCOEF_WBPS, "wbps=%u" }, { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, { NR_I_LCOEFS, NULL }, }; static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; q = bdev_get_queue(ctx.bdev); if (!queue_is_mq(q)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); } blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; u64 v; if (!*p) continue; switch (match_token(p, cost_ctrl_tokens, args)) { case COST_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; case COST_MODEL: match_strlcpy(buf, &args[0], sizeof(buf)); if (strcmp(buf, "linear")) goto einval; continue; } tok = match_token(p, i_lcoef_tokens, args); if (tok == NR_I_LCOEFS) goto einval; if (match_u64(&args[0], &v)) goto einval; u[tok] = v; user = true; } if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; } else { ioc->user_cost_model = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static struct cftype ioc_files[] = { { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = ioc_weight_show, .write = ioc_weight_write, }, { .name = "cost.qos", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_qos_show, .write = ioc_qos_write, }, { .name = "cost.model", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_cost_model_show, .write = ioc_cost_model_write, }, {} 
}; static struct blkcg_policy blkcg_policy_iocost = { .dfl_cftypes = ioc_files, .cpd_alloc_fn = ioc_cpd_alloc, .cpd_free_fn = ioc_cpd_free, .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) { return blkcg_policy_register(&blkcg_policy_iocost); } static void __exit ioc_exit(void) { blkcg_policy_unregister(&blkcg_policy_iocost); } module_init(ioc_init); module_exit(ioc_exit);
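The cost.qos and cost.model interface files parsed above take space-separated "key=value" tokens prefixed by the target device number. Below is a minimal userspace sketch (not part of the kernel sources in this report) that enables the controller and sets QoS targets by writing the same token format that ioc_qos_write() parses; the cgroup2 mount point, the 8:0 device number and the latency/vrate values are illustrative assumptions only.

/*
 * Hedged example: enable blk-iocost QoS for device 8:0 from userspace.
 * Paths and numbers are assumptions for illustration, not recommendations.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cfg =
		"8:0 enable=1 ctrl=user rpct=95.00 rlat=10000 "
		"wpct=95.00 wlat=20000 min=50.00 max=150.00\n";
	int fd = open("/sys/fs/cgroup/io.cost.qos", O_WRONLY);

	if (fd < 0) {
		perror("open io.cost.qos");
		return 1;
	}
	if (write(fd, cfg, strlen(cfg)) < 0)
		perror("write io.cost.qos");
	close(fd);
	return 0;
}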
// SPDX-License-Identifier: GPL-2.0-only
/*
 * test/set flag bits stored in conntrack extension area.
 *
 * (C) 2013 Astaro GmbH & Co KG
 */

#include <linux/export.h>
#include <linux/types.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>

static int replace_u32(u32 *address, u32 mask, u32 new)
{
	u32 old, tmp;

	do {
		old = *address;
		tmp = (old & mask) ^ new;
		if (old == tmp)
			return 0;
	} while (cmpxchg(address, old, tmp) != old);

	return 1;
}

int nf_connlabels_replace(struct nf_conn *ct,
			  const u32 *data, const u32 *mask,
			  unsigned int words32)
{
	struct nf_conn_labels *labels;
	unsigned int size, i;
	int changed = 0;
	u32 *dst;

	labels = nf_ct_labels_find(ct);
	if (!labels)
		return -ENOSPC;

	size = sizeof(labels->bits);
	if (size < (words32 * sizeof(u32)))
		words32 = size / sizeof(u32);

	dst = (u32 *) labels->bits;
	for (i = 0; i < words32; i++)
		changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);

	size /= sizeof(u32);
	for (i = words32; i < size; i++) /* pad */
		replace_u32(&dst[i], 0, 0);

	if (changed)
		nf_conntrack_event_cache(IPCT_LABEL, ct);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_replace);

int nf_connlabels_get(struct net *net, unsigned int bits)
{
	int v;

	if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
		return -ERANGE;

	BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);

	v = atomic_inc_return_relaxed(&net->ct.labels_used);
	WARN_ON_ONCE(v <= 0);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_get);

void nf_connlabels_put(struct net *net)
{
	int v = atomic_dec_return_relaxed(&net->ct.labels_used);

	WARN_ON_ONCE(v < 0);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
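nf_connlabels_replace() above only changes the bits selected by the mask words: each stored word becomes (old & ~mask) ^ data. The following sketch shows a caller that sets a single label bit; the function name and the 4-word (128-bit) label size are assumptions made purely for illustration.

/*
 * Illustrative sketch only (not from the file above): set label bit 5 on a
 * conntrack entry.  The mask words select which bits may change; data holds
 * their new values.  Assumes a valid struct nf_conn *ct obtained elsewhere.
 */
static int example_set_label_bit(struct nf_conn *ct)
{
	u32 data[4] = {}, mask[4] = {};
	unsigned int bit = 5;

	data[bit / 32] = 1U << (bit % 32);	/* new value of the owned bit */
	mask[bit / 32] = 1U << (bit % 32);	/* only this bit is replaced */

	return nf_connlabels_replace(ct, data, mask, ARRAY_SIZE(data));
}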
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails. Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocation don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* * iw_table is the sysfs-set interleave weight table, a value of 0 denotes * system-default value should be used. A NULL iw_table also denotes that * system-default values should be used. Until the system-default table * is implemented, the system-default is always 1. * * iw_table is RCU protected */ static u8 __rcu *iw_table; static DEFINE_MUTEX(iw_table_lock); static u8 get_il_weight(int node) { u8 *table; u8 weight; rcu_read_lock(); table = rcu_dereference(iw_table); /* if no iw_table, use system default */ weight = table ? table[node] : 1; /* if value in iw_table is 0, use system default */ weight = weight ? weight : 1; rcu_read_unlock(); return weight; } /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. 
* * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. 
*/ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. 
*/ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, [MPOL_WEIGHTED_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pmd_folio(*pmd); if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. 
*/ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed++; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. 
Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. 
*/ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; current->il_weight = 0; } task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. 
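 *
 * Purely as an illustration of this path (a user-space sketch, not kernel
 * code): querying the policy that covers a mapped address with the libnuma
 * <numaif.h> wrapper looks roughly like
 *
 *	int mode;
 *	unsigned long nodemask[16] = { 0 };	   room for 1024 nodes
 *
 *	if (get_mempolicy(&mode, nodemask, sizeof(nodemask) * 8,
 *			  addr, MPOL_F_ADDR) == 0)
 *		printf("mode %d\n", mode);
 *
 * where @addr is assumed to point into an existing mapping; with
 * MPOL_F_ADDR and no VMA/shared policy there, mode comes back as
 * MPOL_DEFAULT rather than the task policy, as noted above.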
*/ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else if (pol == current->mempolicy && pol->mode == MPOL_WEIGHTED_INTERLEAVE) { if (current->il_weight) *policy = current->il_prev; else *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_likely_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. 
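 *
 * For orientation only: migrate_to_node() is the per-<source, dest> step
 * behind the migrate_pages(2) syscall (see do_migrate_pages() and
 * kernel_migrate_pages() further down). A hedged user-space sketch, using
 * the libnuma <numaif.h> wrapper to move a task's pages from node 0 to
 * node 1 (@pid is assumed; 0 means the calling task):
 *
 *	unsigned long old_nodes = 1UL << 0, new_nodes = 1UL << 1;
 *	long left = migrate_pages(pid, 8 * sizeof(unsigned long),
 *				  &old_nodes, &new_nodes);
 *
 * A positive return value is the count of pages that could not be moved.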
*/ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. 
*/ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; page = alloc_pages_mpol(gfp, order, pol, ilx, nid); return page_rmappable_folio(page); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. 
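 *
 * As a user-space illustration of this whole path (a sketch; nothing here
 * is new kernel code): binding an anonymous mapping to an interleave of
 * nodes 0 and 1 and migrating any misplaced pages could look like
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	if (mbind(buf, len, MPOL_INTERLEAVE, &nodes,
 *		  8 * sizeof(nodes), MPOL_MF_MOVE | MPOL_MF_STRICT))
 *		perror("mbind");
 *
 * with @len assumed and the mbind() wrapper coming from libnuma's
 * <numaif.h>.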
*/ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(folio, &pagelist, lru) { if (!folio_test_ksm(folio)) break; } if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma( folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= folio->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) *flags |= (MPOL_F_MOF | MPOL_F_MORON); else return -EINVAL; } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. 
*/ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
*/ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { unsigned int node; unsigned int cpuset_mems_cookie; retry: /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; current->il_weight = get_il_weight(node); } current->il_weight--; return node; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; unsigned int cpuset_mems_cookie; /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nid = next_node_in(current->il_prev, policy->nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_WEIGHTED_INTERLEAVE: return weighted_interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return z->zone ? zone_to_nid(z->zone) : node; } case MPOL_LOCAL: return node; default: BUG(); } } static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask) { /* * barrier stabilizes the nodemask locally so that it can be iterated * over safely without concern for changes. 
Allocators validate node * selection does not violate mems_allowed, so this is safe. */ barrier(); memcpy(mask, &pol->nodes, sizeof(nodemask_t)); barrier(); return nodes_weight(*mask); } static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nr_nodes; u8 *table; unsigned int weight_total = 0; u8 weight; int nid; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); table = rcu_dereference(iw_table); /* calculate the total weight */ for_each_node_mask(nid, nodemask) { /* detect system default usage */ weight = table ? table[nid] : 1; weight = weight ? weight : 1; weight_total += weight; } /* Calculate the node offset based on totals */ target = ilx % weight_total; nid = first_node(nodemask); while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; weight = weight ? weight : 1; if (target < weight) break; target -= weight; nid = next_node_in(nid, nodemask); } rcu_read_unlock(); return nid; } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: *nid = (ilx == NO_INTERLEAVE_INDEX) ? weighted_interleave_nodes(pol) : weighted_interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. 
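 *
 * Worked illustration (node numbers are hypothetical): with an
 * MPOL_INTERLEAVE policy over nodes 0-3, policy_nodemask() overrides the
 * returned nid with interleave_nid(pol, ilx), i.e. the (ilx % 4)'th set
 * node, and leaves *@nodemask NULL; with MPOL_BIND over nodes 1-2 it
 * normally returns &pol->nodes instead, so the hugetlb allocator filters
 * its zonelist to those nodes.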
*/ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages_noprof(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. 
*/ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node_noprof(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of alloc_pages_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; pgoff_t ilx; struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio_noprof); /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. 
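 *
 * Minimal usage sketch (illustrative only): an order-2 request returns
 * four contiguous pages, freed with the matching order:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (page) {
 *		void *addr = page_address(page);
 *		memset(addr, 0, 4 * PAGE_SIZE);
 *		__free_pages(page, 2);
 *	}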
*/ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages_noprof); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; u8 *table, *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; int i; if (!nr_pages) return 0; /* read the nodes onto the stack, retry if done during rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nnodes = read_once_policy_nodemask(pol, &nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; /* Continue allocating from most recent node and adjust the nr_pages */ node = me->il_prev; weight = me->il_weight; if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, NULL, page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ if (rem_pages <= weight) { me->il_weight -= rem_pages; return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ rem_pages -= weight; } /* clear active weight in case of an allocation failure */ me->il_weight = 0; prev_node = node; /* create a local copy of node weights to operate on outside rcu */ weights = kzalloc(nr_node_ids, GFP_KERNEL); if (!weights) return total_allocated; rcu_read_lock(); table = rcu_dereference(iw_table); if (table) memcpy(weights, table, nr_node_ids); rcu_read_unlock(); /* calculate total, detect system default usage */ for_each_node_mask(node, nodes) { if (!weights[node]) weights[node] = 1; weight_total += weights[node]; } /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. * Track which node weighted interleave should resume from. * * if (rounds > 0) and (delta == 0), resume_node will always be * the node following prev_node and its weight. 
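 *
 * Worked example (hypothetical weights): nodes = {0,1} with weights
 * {3,1}, no carried-over il_weight, rem_pages = 9. Then weight_total = 4,
 * rounds = 2, delta = 1. Node 0 gets 3 * 2 + 1 = 7 pages (the delta is
 * exhausted there, so resume_node = 0 and resume_weight = 3 - 1 = 2),
 * node 1 gets 1 * 2 = 2 pages, and the next allocation resumes on node 0
 * with 2 units of its weight remaining.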
*/ rounds = rem_pages / weight_total; delta = rem_pages % weight_total; resume_node = next_node_in(prev_node, nodes); resume_weight = weights[resume_node]; for (i = 0; i < nnodes; i++) { node = next_node_in(prev_node, nodes); weight = weights[node]; node_pages = weight * rounds; /* If a delta exists, add this node's portion of the delta */ if (delta > weight) { node_pages += weight; delta -= weight; } else if (delta) { /* when delta is depleted, resume from that node */ node_pages += delta; resume_node = node; resume_weight = weight - delta; delta = 0; } /* node_pages can be 0 if an allocation fails and rounds == 0 */ if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, NULL, page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) break; prev_node = node; } me->il_prev = resume_node; me->il_weight = resume_weight; kfree(weights); return total_allocated; } static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) return alloc_pages_bulk_array_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. 
*/ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. 
* * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; /* * Make sure ptl is held so that we don't preempt and we * have a stable smp processor id */ lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: case MPOL_PREFERRED_MANY: /* * Even though MPOL_PREFERRED_MANY can allocate pages outside * policy nodemask we don't allow numa migration to nodes * outside policy nodemask for now. This is done so that if we * want demotion to slow memory to happen, before allocating * from some DRAM node say 'x', we will end up using a * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario * we should not promote to node 'x' from slow memory node. */ if (pol->flags & MPOL_F_MORON) { /* * Optimize placement among multiple nodes * via NUMA balancing */ if (node_isset(thisnid, pol->nodes)) break; goto out; } /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. 
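 *
 * Worked example (indices are hypothetical): if the tree holds one node
 * [0, 100) with policy A and shared_policy_replace() is asked to install
 * policy B over [30, 60), the old node is trimmed to [0, 30), a copy of A
 * is inserted for [60, 100), and B is inserted for [30, 60).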
*/ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. 
*/ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. 
* * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. 
*/ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } #ifdef CONFIG_SYSFS struct iw_node_attr { struct kobj_attribute kobj_attr; int nid; }; static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct iw_node_attr *node_attr; u8 weight; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); weight = get_il_weight(node_attr->nid); return sysfs_emit(buf, "%d\n", weight); } static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct iw_node_attr *node_attr; u8 *new; u8 *old; u8 weight = 0; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); if (count == 0 || sysfs_streq(buf, "")) weight = 0; else if (kstrtou8(buf, 0, &weight)) return -EINVAL; new = kzalloc(nr_node_ids, GFP_KERNEL); if (!new) return -ENOMEM; mutex_lock(&iw_table_lock); old = rcu_dereference_protected(iw_table, lockdep_is_held(&iw_table_lock)); if (old) memcpy(new, old, nr_node_ids); new[node_attr->nid] = weight; rcu_assign_pointer(iw_table, new); mutex_unlock(&iw_table_lock); synchronize_rcu(); kfree(old); return count; } static struct iw_node_attr **node_attrs; static void sysfs_wi_node_release(struct iw_node_attr *node_attr, struct kobject *parent) { if (!node_attr) return; sysfs_remove_file(parent, &node_attr->kobj_attr.attr); kfree(node_attr->kobj_attr.attr.name); kfree(node_attr); } static void sysfs_wi_release(struct kobject *wi_kobj) { int i; for (i = 0; i < nr_node_ids; i++) sysfs_wi_node_release(node_attrs[i], wi_kobj); kobject_put(wi_kobj); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = sysfs_wi_release, }; static int add_weight_node(int nid, struct kobject *wi_kobj) { struct iw_node_attr *node_attr; char *name; node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); if (!node_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { kfree(node_attr); return -ENOMEM; } sysfs_attr_init(&node_attr->kobj_attr.attr); node_attr->kobj_attr.attr.name = name; node_attr->kobj_attr.attr.mode = 0644; node_attr->kobj_attr.show = node_show; node_attr->kobj_attr.store = node_store; node_attr->nid = nid; if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { kfree(node_attr->kobj_attr.attr.name); kfree(node_attr); pr_err("failed to add attribute to weighted_interleave\n"); return -ENOMEM; } node_attrs[nid] = node_attr; return 0; } static int add_weighted_interleave_group(struct kobject *root_kobj) { struct kobject *wi_kobj; int nid, err; wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); if 
(!wi_kobj) return -ENOMEM; err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, "weighted_interleave"); if (err) { kfree(wi_kobj); return err; } for_each_node_state(nid, N_POSSIBLE) { err = add_weight_node(nid, wi_kobj); if (err) { pr_err("failed to add sysfs [node%d]\n", nid); break; } } if (err) kobject_put(wi_kobj); return 0; } static void mempolicy_kobj_release(struct kobject *kobj) { u8 *old; mutex_lock(&iw_table_lock); old = rcu_dereference_protected(iw_table, lockdep_is_held(&iw_table_lock)); rcu_assign_pointer(iw_table, NULL); mutex_unlock(&iw_table_lock); synchronize_rcu(); kfree(old); kfree(node_attrs); kfree(kobj); } static const struct kobj_type mempolicy_ktype = { .release = mempolicy_kobj_release }; static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); if (!mempolicy_kobj) { err = -ENOMEM; goto err_out; } node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), GFP_KERNEL); if (!node_attrs) { err = -ENOMEM; goto mempol_out; } err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, "mempolicy"); if (err) goto node_out; err = add_weighted_interleave_group(mempolicy_kobj); if (err) { pr_err("mempolicy sysfs structure failed to initialize\n"); kobject_put(mempolicy_kobj); return err; } return err; node_out: kfree(node_attrs); mempol_out: kfree(mempolicy_kobj); err_out: pr_err("failed to add mempolicy kobject to the system\n"); return err; } late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */
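The weighted-interleave attributes registered above are ordinary sysfs files; with mm_kobj corresponding to /sys/kernel/mm, they appear as /sys/kernel/mm/mempolicy/weighted_interleave/node<N>. A small userspace sketch of adjusting the per-node weights that node_store() parses with kstrtou8(); the helper name and the minimal error handling are illustrative only, not part of the kernel code above:

/*
 * Illustration only (userspace): write a small integer weight to the
 * per-node attribute created by add_weight_node(). node_store() accepts
 * values via kstrtou8(), so the usable range is 0-255.
 */
#include <stdio.h>

static int set_interleave_weight(int nid, unsigned int weight)
{
	char path[96];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/mempolicy/weighted_interleave/node%d", nid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", weight);	/* kstrtou8() tolerates the trailing newline */
	return fclose(f);
}

int main(void)
{
	/* Example: give node 0 twice the interleave weight of node 1 */
	set_interleave_weight(0, 2);
	set_interleave_weight(1, 1);
	return 0;
}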
// SPDX-License-Identifier: GPL-2.0-or-later
/* mpihelp-mul_2.c - MPI helper functions
 * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
 * Note: This code is heavily based on the GNU MP Library.
 *	 Actually it's the same code with only minor changes in the
 *	 way the data is stored; this is to support the abstraction
 *	 of an optional secure memory allocation which may be used
 *	 to avoid revealing of sensitive data due to paging etc.
 *	 The GNU MP Library itself is published under the LGPL;
 *	 however I decided to publish this code under the plain GPL.
 */

#include "mpi-internal.h"
#include "longlong.h"

mpi_limb_t
mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
		 mpi_size_t s1_size, mpi_limb_t s2_limb)
{
	mpi_limb_t cy_limb;
	mpi_size_t j;
	mpi_limb_t prod_high, prod_low;
	mpi_limb_t x;

	/* The loop counter and index J goes from -SIZE to -1.  This way
	 * the loop becomes faster.  */
	j = -s1_size;
	res_ptr -= j;
	s1_ptr -= j;

	cy_limb = 0;
	do {
		umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);

		prod_low += cy_limb;
		cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;

		x = res_ptr[j];
		prod_low = x + prod_low;
		cy_limb += prod_low < x ? 1 : 0;
		res_ptr[j] = prod_low;
	} while (++j);

	return cy_limb;
}
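The loop above is a limb-wise multiply-accumulate: it adds s1_ptr[0..s1_size-1] * s2_limb into res_ptr[0..s1_size-1] and returns the carry that falls out of the most significant limb. A minimal sketch of the same operation, assuming 64-bit limbs and using GCC/Clang's unsigned __int128 in place of umul_ppmm() (illustration only, not the kernel's implementation):

/*
 * Illustration only: res[0..n-1] += s1[0..n-1] * s2, returning the
 * final carry-out limb, with unsigned __int128 standing in for the
 * umul_ppmm() primitive used by the kernel code above.
 */
#include <stdint.h>
#include <stddef.h>

uint64_t addmul_1(uint64_t *res, const uint64_t *s1, size_t n, uint64_t s2)
{
	uint64_t carry = 0;

	for (size_t i = 0; i < n; i++) {
		/* 64x64 -> 128-bit product, plus previous carry, plus res[i];
		 * the sum cannot overflow 128 bits. */
		unsigned __int128 acc = (unsigned __int128)s1[i] * s2;

		acc += carry;
		acc += res[i];
		res[i] = (uint64_t)acc;		/* low limb back into the result */
		carry = (uint64_t)(acc >> 64);	/* high limb becomes the new carry */
	}
	return carry;	/* caller must handle the final carry-out */
}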
/*
 * llc_input.c - Minimal input path for LLC
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/llc_sap.h>

#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
#else
#define dprintk(args...)
#endif

/*
 * Packet handler for the station, registerable because in the minimal
 * LLC core that is taking shape only the very minimal subset of LLC that
 * is needed for things like IPX, Appletalk, etc will stay, with all the
 * rest in the llc1 and llc2 modules.
 */
static void (*llc_station_handler)(struct sk_buff *skb);

/*
 * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN.
 */
static void (*llc_type_handlers[2])(struct llc_sap *sap, struct sk_buff *skb);

void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
					    struct sk_buff *skb))
{
	smp_wmb(); /* ensure initialisation is complete before it's called */
	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
		llc_type_handlers[type - 1] = handler;
}

void llc_remove_pack(int type)
{
	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
		llc_type_handlers[type - 1] = NULL;
	synchronize_net();
}

void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
{
	/* Ensure initialisation is complete before it's called */
	if (handler)
		smp_wmb();

	llc_station_handler = handler;

	if (!handler)
		synchronize_net();
}

/**
 * llc_pdu_type - returns which LLC component must handle for PDU
 * @skb: input skb
 *
 * This function returns which LLC component must handle this PDU.
*/ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb->transport_header += llc_len; skb_pull(skb, llc_len); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? 
READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler);
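llc_fixup_skb() above derives the LLC header length from the two low-order bits of the first control byte: U-format PDUs carry a 1-byte control field, I/S-format PDUs a 2-byte one, and the DSAP/SSAP pair always adds two more bytes, so the header is 3 or 4 bytes. A standalone sketch of just that length rule (illustration only; the macro values mirror the usual <net/llc_pdu.h> definitions but are restated here so the snippet compiles on its own):

/*
 * Sketch, not kernel code: the 802.2 header-length rule applied by
 * llc_fixup_skb(). Both low bits set in the first control byte means
 * a U-format PDU with a 1-byte control field; otherwise (I/S format)
 * the control field is 2 bytes.
 */
#include <stdint.h>

#define LLC_PDU_TYPE_MASK 0x03	/* assumed values, per <net/llc_pdu.h> */
#define LLC_PDU_TYPE_U    0x03

unsigned int llc_header_len(uint8_t ctrl_1)
{
	/* U format -> 1-byte control field, I/S format -> 2 bytes */
	unsigned int ctrl_len =
		((ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) ? 1 : 2;

	return 2 + ctrl_len;	/* DSAP + SSAP + control: 3 or 4 bytes */
}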
// SPDX-License-Identifier: GPL-2.0-or-later
/* PKCS#7 parser
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS7: "fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/oid_registry.h> #include <crypto/public_key.h> #include "pkcs7_parser.h" #include "pkcs7.asn1.h" MODULE_DESCRIPTION("PKCS#7 parser"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); struct pkcs7_parse_context { struct pkcs7_message *msg; /* Message being constructed */ struct pkcs7_signed_info *sinfo; /* SignedInfo being constructed */ struct pkcs7_signed_info **ppsinfo; struct x509_certificate *certs; /* Certificate cache */ struct x509_certificate **ppcerts; unsigned long data; /* Start of data */ enum OID last_oid; /* Last OID encountered */ unsigned x509_index; unsigned sinfo_index; const void *raw_serial; unsigned raw_serial_size; unsigned raw_issuer_size; const void *raw_issuer; const void *raw_skid; unsigned raw_skid_size; bool expect_skid; }; /* * Free a signed information block. */ static void pkcs7_free_signed_info(struct pkcs7_signed_info *sinfo) { if (sinfo) { public_key_signature_free(sinfo->sig); kfree(sinfo); } } /** * pkcs7_free_message - Free a PKCS#7 message * @pkcs7: The PKCS#7 message to free */ void pkcs7_free_message(struct pkcs7_message *pkcs7) { struct x509_certificate *cert; struct pkcs7_signed_info *sinfo; if (pkcs7) { while (pkcs7->certs) { cert = pkcs7->certs; pkcs7->certs = cert->next; x509_free_certificate(cert); } while (pkcs7->crl) { cert = pkcs7->crl; pkcs7->crl = cert->next; x509_free_certificate(cert); } while (pkcs7->signed_infos) { sinfo = pkcs7->signed_infos; pkcs7->signed_infos = sinfo->next; pkcs7_free_signed_info(sinfo); } kfree(pkcs7); } } EXPORT_SYMBOL_GPL(pkcs7_free_message); /* * Check authenticatedAttributes are provided or not provided consistently. 
*/ static int pkcs7_check_authattrs(struct pkcs7_message *msg) { struct pkcs7_signed_info *sinfo; bool want = false; sinfo = msg->signed_infos; if (!sinfo) goto inconsistent; if (sinfo->authattrs) { want = true; msg->have_authattrs = true; } for (sinfo = sinfo->next; sinfo; sinfo = sinfo->next) if (!!sinfo->authattrs != want) goto inconsistent; return 0; inconsistent: pr_warn("Inconsistently supplied authAttrs\n"); return -EINVAL; } /** * pkcs7_parse_message - Parse a PKCS#7 message * @data: The raw binary ASN.1 encoded message to be parsed * @datalen: The size of the encoded message */ struct pkcs7_message *pkcs7_parse_message(const void *data, size_t datalen) { struct pkcs7_parse_context *ctx; struct pkcs7_message *msg = ERR_PTR(-ENOMEM); int ret; ctx = kzalloc(sizeof(struct pkcs7_parse_context), GFP_KERNEL); if (!ctx) goto out_no_ctx; ctx->msg = kzalloc(sizeof(struct pkcs7_message), GFP_KERNEL); if (!ctx->msg) goto out_no_msg; ctx->sinfo = kzalloc(sizeof(struct pkcs7_signed_info), GFP_KERNEL); if (!ctx->sinfo) goto out_no_sinfo; ctx->sinfo->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL); if (!ctx->sinfo->sig) goto out_no_sig; ctx->data = (unsigned long)data; ctx->ppcerts = &ctx->certs; ctx->ppsinfo = &ctx->msg->signed_infos; /* Attempt to decode the signature */ ret = asn1_ber_decoder(&pkcs7_decoder, ctx, data, datalen); if (ret < 0) { msg = ERR_PTR(ret); goto out; } ret = pkcs7_check_authattrs(ctx->msg); if (ret < 0) { msg = ERR_PTR(ret); goto out; } msg = ctx->msg; ctx->msg = NULL; out: while (ctx->certs) { struct x509_certificate *cert = ctx->certs; ctx->certs = cert->next; x509_free_certificate(cert); } out_no_sig: pkcs7_free_signed_info(ctx->sinfo); out_no_sinfo: pkcs7_free_message(ctx->msg); out_no_msg: kfree(ctx); out_no_ctx: return msg; } EXPORT_SYMBOL_GPL(pkcs7_parse_message); /** * pkcs7_get_content_data - Get access to the PKCS#7 content * @pkcs7: The preparsed PKCS#7 message to access * @_data: Place to return a pointer to the data * @_data_len: Place to return the data length * @_headerlen: Size of ASN.1 header not included in _data * * Get access to the data content of the PKCS#7 message. The size of the * header of the ASN.1 object that contains it is also provided and can be used * to adjust *_data and *_data_len to get the entire object. * * Returns -ENODATA if the data object was missing from the message. */ int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_data_len, size_t *_headerlen) { if (!pkcs7->data) return -ENODATA; *_data = pkcs7->data; *_data_len = pkcs7->data_len; if (_headerlen) *_headerlen = pkcs7->data_hdrlen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_get_content_data); /* * Note an OID when we find one for later processing when we know how * to interpret it. */ int pkcs7_note_OID(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->last_oid = look_up_OID(value, vlen); if (ctx->last_oid == OID__NR) { char buffer[50]; sprint_oid(value, vlen, buffer, sizeof(buffer)); printk("PKCS7: Unknown OID: [%lu] %s\n", (unsigned long)value - ctx->data, buffer); } return 0; } /* * Note the digest algorithm for the signature. 
*/ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { case OID_sha1: ctx->sinfo->sig->hash_algo = "sha1"; break; case OID_sha256: ctx->sinfo->sig->hash_algo = "sha256"; break; case OID_sha384: ctx->sinfo->sig->hash_algo = "sha384"; break; case OID_sha512: ctx->sinfo->sig->hash_algo = "sha512"; break; case OID_sha224: ctx->sinfo->sig->hash_algo = "sha224"; break; case OID_sm3: ctx->sinfo->sig->hash_algo = "sm3"; break; case OID_gost2012Digest256: ctx->sinfo->sig->hash_algo = "streebog256"; break; case OID_gost2012Digest512: ctx->sinfo->sig->hash_algo = "streebog512"; break; case OID_sha3_256: ctx->sinfo->sig->hash_algo = "sha3-256"; break; case OID_sha3_384: ctx->sinfo->sig->hash_algo = "sha3-384"; break; case OID_sha3_512: ctx->sinfo->sig->hash_algo = "sha3-512"; break; default: printk("Unsupported digest algo: %u\n", ctx->last_oid); return -ENOPKG; } return 0; } /* * Note the public key algorithm for the signature. */ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { case OID_rsaEncryption: ctx->sinfo->sig->pkey_algo = "rsa"; ctx->sinfo->sig->encoding = "pkcs1"; break; case OID_id_ecdsa_with_sha1: case OID_id_ecdsa_with_sha224: case OID_id_ecdsa_with_sha256: case OID_id_ecdsa_with_sha384: case OID_id_ecdsa_with_sha512: case OID_id_ecdsa_with_sha3_256: case OID_id_ecdsa_with_sha3_384: case OID_id_ecdsa_with_sha3_512: ctx->sinfo->sig->pkey_algo = "ecdsa"; ctx->sinfo->sig->encoding = "x962"; break; case OID_SM2_with_SM3: ctx->sinfo->sig->pkey_algo = "sm2"; ctx->sinfo->sig->encoding = "raw"; break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->sinfo->sig->pkey_algo = "ecrdsa"; ctx->sinfo->sig->encoding = "raw"; break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); return -ENOPKG; } return 0; } /* * We only support signed data [RFC2315 sec 9]. 
*/ int pkcs7_check_content_type(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; if (ctx->last_oid != OID_signed_data) { pr_warn("Only support pkcs7_signedData type\n"); return -EINVAL; } return 0; } /* * Note the SignedData version */ int pkcs7_note_signeddata_version(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; unsigned version; if (vlen != 1) goto unsupported; ctx->msg->version = version = *(const u8 *)value; switch (version) { case 1: /* PKCS#7 SignedData [RFC2315 sec 9.1] * CMS ver 1 SignedData [RFC5652 sec 5.1] */ break; case 3: /* CMS ver 3 SignedData [RFC2315 sec 5.1] */ break; default: goto unsupported; } return 0; unsupported: pr_warn("Unsupported SignedData version\n"); return -EINVAL; } /* * Note the SignerInfo version */ int pkcs7_note_signerinfo_version(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; unsigned version; if (vlen != 1) goto unsupported; version = *(const u8 *)value; switch (version) { case 1: /* PKCS#7 SignerInfo [RFC2315 sec 9.2] * CMS ver 1 SignerInfo [RFC5652 sec 5.3] */ if (ctx->msg->version != 1) goto version_mismatch; ctx->expect_skid = false; break; case 3: /* CMS ver 3 SignerInfo [RFC2315 sec 5.3] */ if (ctx->msg->version == 1) goto version_mismatch; ctx->expect_skid = true; break; default: goto unsupported; } return 0; unsupported: pr_warn("Unsupported SignerInfo version\n"); return -EINVAL; version_mismatch: pr_warn("SignedData-SignerInfo version mismatch\n"); return -EBADMSG; } /* * Extract a certificate and store it in the context. */ int pkcs7_extract_cert(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct x509_certificate *x509; if (tag != ((ASN1_UNIV << 6) | ASN1_CONS_BIT | ASN1_SEQ)) { pr_debug("Cert began with tag %02x at %lu\n", tag, (unsigned long)ctx - ctx->data); return -EBADMSG; } /* We have to correct for the header so that the X.509 parser can start * from the beginning. Note that since X.509 stipulates DER, there * probably shouldn't be an EOC trailer - but it is in PKCS#7 (which * stipulates BER). */ value -= hdrlen; vlen += hdrlen; if (((u8*)value)[1] == 0x80) vlen += 2; /* Indefinite length - there should be an EOC */ x509 = x509_cert_parse(value, vlen); if (IS_ERR(x509)) return PTR_ERR(x509); x509->index = ++ctx->x509_index; pr_debug("Got cert %u for %s\n", x509->index, x509->subject); pr_debug("- fingerprint %*phN\n", x509->id->len, x509->id->data); *ctx->ppcerts = x509; ctx->ppcerts = &x509->next; return 0; } /* * Save the certificate list */ int pkcs7_note_certificate_list(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; pr_devel("Got cert list (%02x)\n", tag); *ctx->ppcerts = ctx->msg->certs; ctx->msg->certs = ctx->certs; ctx->certs = NULL; ctx->ppcerts = &ctx->certs; return 0; } /* * Note the content type. */ int pkcs7_note_content(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; if (ctx->last_oid != OID_data && ctx->last_oid != OID_msIndirectData) { pr_warn("Unsupported data type %d\n", ctx->last_oid); return -EINVAL; } ctx->msg->data_type = ctx->last_oid; return 0; } /* * Extract the data from the message and store that and its content type OID in * the context. 
 */
int pkcs7_note_data(void *context, size_t hdrlen,
		    unsigned char tag,
		    const void *value, size_t vlen)
{
	struct pkcs7_parse_context *ctx = context;

	pr_debug("Got data\n");

	ctx->msg->data = value;
	ctx->msg->data_len = vlen;
	ctx->msg->data_hdrlen = hdrlen;
	return 0;
}

/*
 * Parse authenticated attributes.
 */
int pkcs7_sig_note_authenticated_attr(void *context, size_t hdrlen,
				      unsigned char tag,
				      const void *value, size_t vlen)
{
	struct pkcs7_parse_context *ctx = context;
	struct pkcs7_signed_info *sinfo = ctx->sinfo;
	enum OID content_type;

	pr_devel("AuthAttr: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value);

	switch (ctx->last_oid) {
	case OID_contentType:
		if (__test_and_set_bit(sinfo_has_content_type, &sinfo->aa_set))
			goto repeated;
		content_type = look_up_OID(value, vlen);
		if (content_type != ctx->msg->data_type) {
			pr_warn("Mismatch between global data type (%d) and sinfo %u (%d)\n",
				ctx->msg->data_type, sinfo->index,
				content_type);
			return -EBADMSG;
		}
		return 0;

	case OID_signingTime:
		if (__test_and_set_bit(sinfo_has_signing_time, &sinfo->aa_set))
			goto repeated;
		/* Should we check that the signing time is consistent
		 * with the signer's X.509 cert?
		 */
		return x509_decode_time(&sinfo->signing_time,
					hdrlen, tag, value, vlen);

	case OID_messageDigest:
		if (__test_and_set_bit(sinfo_has_message_digest, &sinfo->aa_set))
			goto repeated;
		if (tag != ASN1_OTS)
			return -EBADMSG;
		sinfo->msgdigest = value;
		sinfo->msgdigest_len = vlen;
		return 0;

	case OID_smimeCapabilites:
		if (__test_and_set_bit(sinfo_has_smime_caps, &sinfo->aa_set))
			goto repeated;
		if (ctx->msg->data_type != OID_msIndirectData) {
			pr_warn("S/MIME Caps only allowed with Authenticode\n");
			return -EKEYREJECTED;
		}
		return 0;

		/* Microsoft SpOpusInfo seems to contain cont[0] 16-bit BE
		 * char URLs and cont[1] 8-bit char URLs.
		 *
		 * Microsoft StatementType seems to contain a list of OIDs that
		 * are also used as extendedKeyUsage types in X.509 certs.
*/ case OID_msSpOpusInfo: if (__test_and_set_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) goto repeated; goto authenticode_check; case OID_msStatementType: if (__test_and_set_bit(sinfo_has_ms_statement_type, &sinfo->aa_set)) goto repeated; authenticode_check: if (ctx->msg->data_type != OID_msIndirectData) { pr_warn("Authenticode AuthAttrs only allowed with Authenticode\n"); return -EKEYREJECTED; } /* I'm not sure how to validate these */ return 0; default: return 0; } repeated: /* We permit max one item per AuthenticatedAttribute and no repeats */ pr_warn("Repeated/multivalue AuthAttrs not permitted\n"); return -EKEYREJECTED; } /* * Note the set of auth attributes for digestion purposes [RFC2315 sec 9.3] */ int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct pkcs7_signed_info *sinfo = ctx->sinfo; if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) || !test_bit(sinfo_has_message_digest, &sinfo->aa_set)) { pr_warn("Missing required AuthAttr\n"); return -EBADMSG; } if (ctx->msg->data_type != OID_msIndirectData && test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) { pr_warn("Unexpected Authenticode AuthAttr\n"); return -EBADMSG; } /* We need to switch the 'CONT 0' to a 'SET OF' when we digest */ sinfo->authattrs = value - (hdrlen - 1); sinfo->authattrs_len = vlen + (hdrlen - 1); return 0; } /* * Note the issuing certificate serial number */ int pkcs7_sig_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->raw_serial = value; ctx->raw_serial_size = vlen; return 0; } /* * Note the issuer's name */ int pkcs7_sig_note_issuer(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->raw_issuer = value; ctx->raw_issuer_size = vlen; return 0; } /* * Note the issuing cert's subjectKeyIdentifier */ int pkcs7_sig_note_skid(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; pr_devel("SKID: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value); ctx->raw_skid = value; ctx->raw_skid_size = vlen; return 0; } /* * Note the signature data */ int pkcs7_sig_note_signature(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->sinfo->sig->s = kmemdup(value, vlen, GFP_KERNEL); if (!ctx->sinfo->sig->s) return -ENOMEM; ctx->sinfo->sig->s_size = vlen; return 0; } /* * Note a signature information block */ int pkcs7_note_signed_info(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct pkcs7_signed_info *sinfo = ctx->sinfo; struct asymmetric_key_id *kid; if (ctx->msg->data_type == OID_msIndirectData && !sinfo->authattrs) { pr_warn("Authenticode requires AuthAttrs\n"); return -EBADMSG; } /* Generate cert issuer + serial number key ID */ if (!ctx->expect_skid) { kid = asymmetric_key_generate_id(ctx->raw_serial, ctx->raw_serial_size, ctx->raw_issuer, ctx->raw_issuer_size); } else { kid = asymmetric_key_generate_id(ctx->raw_skid, ctx->raw_skid_size, "", 0); } if (IS_ERR(kid)) return PTR_ERR(kid); pr_devel("SINFO KID: %u [%*phN]\n", kid->len, kid->len, kid->data); sinfo->sig->auth_ids[0] = kid; sinfo->index = ++ctx->sinfo_index; *ctx->ppsinfo = sinfo; ctx->ppsinfo = &sinfo->next; ctx->sinfo = kzalloc(sizeof(struct 
pkcs7_signed_info), GFP_KERNEL); if (!ctx->sinfo) return -ENOMEM; ctx->sinfo->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL); if (!ctx->sinfo->sig) return -ENOMEM; return 0; }
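Taken together, pkcs7_parse_message(), pkcs7_get_content_data() and pkcs7_free_message() form the small public surface of this parser. A usage sketch in kernel context; the wrapper function and its error handling are illustrative only and are not part of the parser itself:

/*
 * Sketch: parse a PKCS#7/CMS blob and release it again.
 * pkcs7_parse_message() returns an ERR_PTR() on failure, so the result
 * must be checked with IS_ERR() rather than against NULL.
 * pkcs7_get_content_data() returns -ENODATA for detached content.
 */
#include <linux/err.h>
#include <crypto/pkcs7.h>

static int example_parse_pkcs7(const void *blob, size_t blob_len)
{
	struct pkcs7_message *pkcs7;
	const void *data;
	size_t data_len;
	int ret;

	pkcs7 = pkcs7_parse_message(blob, blob_len);
	if (IS_ERR(pkcs7))
		return PTR_ERR(pkcs7);

	/* Look at the embedded content, if the message carries any */
	ret = pkcs7_get_content_data(pkcs7, &data, &data_len, NULL);

	pkcs7_free_message(pkcs7);
	return ret;
}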
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *	Chetan Loke		:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.
On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Resume
  If dev_has_header(dev) == false we are unable to restore the ll header,
    because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
*/ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. 
*/ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. 
*/ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock_bh(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0); pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; 
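The block geometry filled in by init_prb_bdqc() above mirrors what user space passes down in struct tpacket_req3. As a hedged illustration only (this sketch is not part of af_packet.c; the block/frame sizes, the retire timeout and the error handling are assumptions chosen for the example), a minimal user-space setup of such a TPACKET_V3 receive ring might look like:

/* user-space sketch, not kernel code */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int setup_v3_rx_ring(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); /* needs CAP_NET_RAW */
	int version = TPACKET_V3;
	struct tpacket_req3 req;
	void *ring;

	if (fd < 0)
		return -1;

	/* Select the TPACKET_V3 header/block format before creating the ring. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)))
		goto err;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;	/* 1 MiB blocks, a multiple of the page size */
	req.tp_block_nr = 8;
	req.tp_frame_size = 2048;	/* still checked against tp_frame_nr by packet_set_ring() */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;	/* block retire timeout in ms; 0 lets prb_calc_retire_blk_tmo() pick one */
	req.tp_sizeof_priv = 0;

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)))
		goto err;

	/* Map all blocks as one contiguous area; the kernel fills them in order. */
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		goto err;

	return fd;
err:
	close(fd);
	return -1;
}
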
rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer; } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. 
*/ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. 
* call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... 
*/ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. 
*/ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. 
*/ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned 
int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if 
(rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & 
PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { __dev_remove_pack(&po->prot_hook); /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); __fanout_link(sk, po); err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. 
*/ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. 
*/ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. 
* * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: kfree_skb_reason(skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). 
*/ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. 
*/ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. */ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: kfree_skb_reason(skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); kfree_skb_reason(copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && 
(__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb->tstamp = sockc->transmit_time; skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? 
len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo = 0; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { if (need_wait && skb) { timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? 
-ETIMEDOUT : -ERESTARTSYS; goto out_put; } } /* check for additional frames */ continue; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || /* Note: packet_read_pending() might be slow if we have * to call it as it's per_cpu variable, but in fast-path * we already short-circuit the loop with the first * condition, and luckily don't have to go that path * anyway. */ (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. 
*/ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, sockc.tsflags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; if (unlikely(extra_len == 4)) skb->no_fcs = 1; 
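/* The frame has been copied from the user buffer and the skb metadata
 * (protocol, device, priority, mark, timestamp, no_fcs) is set; what follows
 * parses the link-layer headers, applies the optional virtio_net_hdr
 * supplied via PACKET_VNET_HDR, and hands the skb to the device through
 * packet_xmit(). */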
packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? 
dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; sock_init_data(sock, sk); po = pkt_sk(sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; err = packet_alloc_pending(po); if (err) goto out2; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out2: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. 
This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? 
skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { packet_dev_mc(dev, ml, -1); *mlp = ml->next; kfree(ml); } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... 
*/ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: if (optlen < sizeof(req_u.req)) break; ret = copy_from_sockptr(&req_u.req, optval, sizeof(req_u.req)) ? -EINVAL : 0; break; case TPACKET_V3: default: if (optlen < sizeof(req_u.req3)) break; ret = copy_from_sockptr(&req_u.req3, optval, sizeof(req_u.req3)) ? 
-EINVAL : 0; break; } if (!ret) ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? 
sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH: val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? 
((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. 
*/ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; if (was_running) { WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? 
tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_long_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %ld\n", atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); if (was_running) { WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) 
seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);
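The receive-ring path above (PACKET_RX_RING handled in packet_setsockopt(), ring allocation in packet_set_ring(), the mapping in packet_mmap(), and frame delivery in tpacket_rcv()) is driven entirely from userspace. Below is a minimal, illustrative userspace sketch of setting up and draining a TPACKET_V2 receive ring; it is not part of af_packet.c, and the block/frame geometry is an arbitrary example chosen only to satisfy the sanity checks in packet_set_ring().

/* Illustrative only: map a TPACKET_V2 RX ring and poll it. */
#include <stdio.h>
#include <poll.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0)
		return 1;

	int ver = TPACKET_V2;
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return 1;

	/* Example geometry: page-aligned blocks, two frames per block, and
	 * tp_frame_nr == frames_per_block * tp_block_nr, as packet_set_ring()
	 * requires. */
	struct tpacket_req req = {
		.tp_block_size = 4096,
		.tp_frame_size = 2048,
		.tp_block_nr   = 64,
		.tp_frame_nr   = 128,
	};
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
		return 1;

	size_t map_len = (size_t)req.tp_block_size * req.tp_block_nr;
	char *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return 1;

	/* Frames can be addressed contiguously here because tp_block_size is
	 * an exact multiple of tp_frame_size. */
	unsigned int frame = 0;
	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)frame * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };
			poll(&pfd, 1, -1);	/* wait for tpacket_rcv() to fill a slot */
			continue;
		}

		/* Packet data starts tp_mac bytes into the frame. */
		printf("frame %u: %u of %u bytes captured\n",
		       frame, hdr->tp_snaplen, hdr->tp_len);

		hdr->tp_status = TP_STATUS_KERNEL;	/* return the slot to the kernel */
		frame = (frame + 1) % req.tp_frame_nr;
	}
}

Opening the socket requires CAP_NET_RAW (see the ns_capable() check in packet_create()); binding to a specific interface would go through packet_bind() before the ring is drained.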
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H #include <linux/swap.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/pm.h> #include <linux/mm.h> #include <linux/freezer.h> #include <asm/errno.h> #ifdef CONFIG_VT extern void pm_set_vt_switch(int); #else static inline void pm_set_vt_switch(int do_switch) { } #endif #ifdef CONFIG_VT_CONSOLE_SLEEP extern void pm_prepare_console(void); extern void pm_restore_console(void); #else static inline void pm_prepare_console(void) { } static inline void pm_restore_console(void) { } #endif typedef int __bitwise suspend_state_t; #define PM_SUSPEND_ON ((__force suspend_state_t) 0) #define PM_SUSPEND_TO_IDLE ((__force suspend_state_t) 1) #define PM_SUSPEND_STANDBY ((__force suspend_state_t) 2) #define PM_SUSPEND_MEM ((__force suspend_state_t) 3) #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. * * @valid: Callback to determine if given system sleep state is supported by * the platform. * Valid (ie. supported) states are advertised in /sys/power/state. Note * that it still may be impossible to enter given system sleep state if the * conditions aren't right. 
* There is the %suspend_valid_only_mem function available that can be * assigned to this if the platform only supports mem sleep. * * @begin: Initialise a transition to given system sleep state. * @begin() is executed right prior to suspending devices. The information * conveyed to the platform code by @begin() should be disregarded by it as * soon as @end() is executed. If @begin() fails (ie. returns nonzero), * @prepare(), @enter() and @finish() will not be called by the PM core. * This callback is optional. However, if it is implemented, the argument * passed to @enter() is redundant and should be ignored. * * @prepare: Prepare the platform for entering the system sleep state indicated * by @begin(). * @prepare() is called right after devices have been suspended (ie. the * appropriate .suspend() method has been executed for each device) and * before device drivers' late suspend callbacks are executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@prepare_late(), @enter(), * and @wake() will not be called in that case). * * @prepare_late: Finish preparing the platform for entering the system sleep * state indicated by @begin(). * @prepare_late is called before disabling nonboot CPUs and after * device drivers' late suspend callbacks have been executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@enter() will not be * executed). * * @enter: Enter the system sleep state indicated by @begin() or represented by * the argument if @begin() is not implemented. * This callback is mandatory. It returns 0 on success or a negative * error code otherwise, in which case the system cannot enter the desired * sleep state. * * @wake: Called when the system has just left a sleep state, right after * the nonboot CPUs have been enabled and before device drivers' early * resume callbacks are executed. * This callback is optional, but should be implemented by the platforms * that implement @prepare_late(). If implemented, it is always called * after @prepare_late and @enter(), even if one of them fails. * * @finish: Finish wake-up of the platform. * @finish is called right prior to calling device drivers' regular suspend * callbacks. * This callback is optional, but should be implemented by the platforms * that implement @prepare(). If implemented, it is always called after * @enter() and @wake(), even if any of them fails. It is executed after * a failing @prepare. * * @suspend_again: Returns whether the system should suspend again (true) or * not (false). If the platform wants to poll sensors or execute some * code during suspended without invoking userspace and most of devices, * suspend_again callback is the place assuming that periodic-wakeup or * alarm-wakeup is already setup. This allows to execute some codes while * being kept suspended in the view of userland and devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state or * the transition to the sleep state has been aborted. * This callback is optional, but should be implemented by the platforms * that implement @begin(). Accordingly, platforms implementing @begin() * should also provide a @end() which cleans up transitions aborted before * @enter(). * * @recover: Recover the platform from a suspend failure. * Called by the PM core if the suspending of devices fails. 
* This callback is optional and should only be implemented by platforms * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); int (*begin)(suspend_state_t state); int (*prepare)(void); int (*prepare_late)(void); int (*enter)(suspend_state_t state); void (*wake)(void); void (*finish)(void); bool (*suspend_again)(void); void (*end)(void); void (*recover)(void); }; struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); void (*check)(void); bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); void (*end)(void); }; #ifdef CONFIG_SUSPEND extern suspend_state_t pm_suspend_target_state; extern suspend_state_t mem_sleep_current; extern suspend_state_t mem_sleep_default; /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); extern unsigned int pm_suspend_global_flags; #define PM_SUSPEND_FLAG_FW_SUSPEND BIT(0) #define PM_SUSPEND_FLAG_FW_RESUME BIT(1) #define PM_SUSPEND_FLAG_NO_PLATFORM BIT(2) static inline void pm_suspend_clear_flags(void) { pm_suspend_global_flags = 0; } static inline void pm_set_suspend_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_SUSPEND; } static inline void pm_set_resume_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_RESUME; } static inline void pm_set_suspend_no_platform(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_NO_PLATFORM; } /** * pm_suspend_via_firmware - Check if platform firmware will suspend the system. * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the platform firmware is going to be invoked at the end of * the system-wide power management transition (to a sleep state) in progress in * order to complete it, or if the platform firmware has been invoked in order * to complete the last (or preceding) transition of the system to a sleep * state. * * This matters if the caller needs or wants to carry out some special actions * depending on whether or not control will be passed to the platform firmware * subsequently (for example, the device may need to be reset before letting the * platform firmware manipulate it, which is not necessary when the platform * firmware is not going to be invoked) or when such special actions may have * been carried out during the preceding transition of the system to a sleep * state (as they may need to be taken into account). */ static inline bool pm_suspend_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_SUSPEND); } /** * pm_resume_via_firmware - Check if platform firmware has woken up the system. * * To be called during system-wide power management transitions from sleep * states. * * Return 'true' if the platform firmware has passed control to the kernel at * the beginning of the system-wide power management transition in progress, so * the event that woke up the system from sleep has been handled by the platform * firmware. */ static inline bool pm_resume_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_RESUME); } /** * pm_suspend_no_platform - Check if platform may change device power states. 
 * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the power states of devices remain under full control of the * kernel throughout the system-wide suspend and resume cycle in progress (that * is, if a device is put into a certain power state during suspend, it can be * expected to remain in that state during resume). */ static inline bool pm_suspend_no_platform(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_NO_PLATFORM); } /* Suspend-to-idle state machine. */ enum s2idle_states { S2IDLE_STATE_NONE, /* Not suspended/suspending. */ S2IDLE_STATE_ENTER, /* Enter suspend-to-idle. */ S2IDLE_STATE_WAKE, /* Wake up from suspend-to-idle. */ }; extern enum s2idle_states __read_mostly s2idle_state; static inline bool idle_should_enter_s2idle(void) { return unlikely(s2idle_state == S2IDLE_STATE_ENTER); } extern bool pm_suspend_default_s2idle(void); extern void __init pm_states_init(void); extern void s2idle_set_ops(const struct platform_s2idle_ops *ops); extern void s2idle_wake(void); /** * arch_suspend_disable_irqs - disable IRQs for suspend * * Disables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_disable_irqs(void); /** * arch_suspend_enable_irqs - enable IRQs after suspend * * Enables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL #define pm_suspend_target_state (PM_SUSPEND_ON) static inline void pm_suspend_clear_flags(void) {} static inline void pm_set_suspend_via_firmware(void) {} static inline void pm_set_resume_via_firmware(void) {} static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } static inline bool sync_on_suspend_enabled(void) { return true; } static inline bool idle_should_enter_s2idle(void) { return false; } static inline void __init pm_states_init(void) {} static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ /* struct pbe is used for creating lists of pages that should be restored * atomically during the resume from disk, because the page frames they have * occupied before the suspend are in use. */ struct pbe { void *address; /* address of the copy */ void *orig_address; /* original address of a page */ struct pbe *next; }; /** * struct platform_hibernation_ops - hibernation platform support * * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation.
* Called right after shrinking memory and before freezing devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state. * * @pre_snapshot: Prepare the platform for creating the hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @finish: Restore the previous state of the platform after the hibernation * image has been created *or* put the platform into the normal operation * mode after the hibernation (the same method is executed in both cases). * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @prepare: Prepare the platform for entering the low power state. * Called right after the hibernation image has been saved and before * devices are prepared for entering the low power state. * * @enter: Put the system into the low power state after the hibernation image * has been saved to disk. * Called after the nonboot CPUs have been disabled and all of the low * level devices have been shut down (runs with IRQs off). * * @leave: Perform the first stage of the cleanup after the system sleep state * indicated by @set_target() has been left. * Called right after the control has been passed from the boot kernel to * the image kernel, before the nonboot CPUs are enabled and before devices * are resumed. Executed with interrupts disabled. * * @pre_restore: Prepare system for the restoration from a hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @restore_cleanup: Clean up after a failing image restoration. * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @recover: Recover the platform from a failure to suspend devices. * Called by the PM core if the suspending of devices during hibernation * fails. This callback is optional and should only be implemented by * platforms which require special recovery actions in that situation. 
*/ struct platform_hibernation_ops { int (*begin)(pm_message_t stage); void (*end)(void); int (*pre_snapshot)(void); void (*finish)(void); int (*prepare)(void); int (*enter)(void); void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); void (*recover)(void); }; #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void register_nosave_region(unsigned long b, unsigned long e); extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern u32 swsusp_hardware_signature; extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; int pfn_is_nosave(unsigned long pfn); int hibernate_quiet_exec(int (*func)(void *data), void *data); int hibernate_resume_nonboot_cpu_disable(void); int arch_hibernation_header_save(void *addr, unsigned int max_size); int arch_hibernation_header_restore(void *addr); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } static inline bool hibernation_available(void) { return false; } static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { return -ENOTSUPP; } #endif /* CONFIG_HIBERNATION */ int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV int is_hibernate_resume_dev(dev_t dev); #else static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } #endif /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ #define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */ #define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */ #define PM_POST_SUSPEND 0x0004 /* Suspend finished */ #define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */ #define PM_POST_RESTORE 0x0006 /* Restore failed */ extern struct mutex system_transition_mutex; #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); /* kernel/power/main.c */ extern int register_pm_notifier(struct notifier_block *nb); extern int unregister_pm_notifier(struct notifier_block *nb); extern void ksys_sync_helper(void); extern void pm_report_hw_sleep_time(u64 t); extern void pm_report_max_hw_sleep(u64 t); #define pm_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ register_pm_notifier(&fn##_nb); \ } /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; static inline bool pm_suspended_storage(void) { return !gfp_has_io_fs(gfp_allowed_mask); } extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); extern unsigned int 
pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); extern unsigned int lock_system_sleep(void); extern void unlock_system_sleep(unsigned int); #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_pm_notifier(struct notifier_block *nb) { return 0; } static inline void pm_report_hw_sleep_time(u64 t) {}; static inline void pm_report_max_hw_sleep(u64 t) {}; static inline void ksys_sync_helper(void) {} #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_suspended_storage(void) { return false; } static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline unsigned int lock_system_sleep(void) { return 0; } static inline void unlock_system_sleep(unsigned int flags) {} #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; extern bool pm_debug_messages_should_print(void); static inline int pm_dyn_debug_messages_on(void) { #ifdef CONFIG_DYNAMIC_DEBUG return 1; #else return 0; #endif } #ifndef pr_fmt #define pr_fmt(fmt) "PM: " fmt #endif #define __pm_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ else if (pm_dyn_debug_messages_on()) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define __pm_deferred_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk_deferred(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include <linux/printk.h> #define __pm_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #define __pm_deferred_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /** * pm_pr_dbg - print pm sleep debug messages * * If pm_debug_messages_on is enabled and the system is entering/leaving * suspend, print message. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, * print message only from instances explicitly enabled on dynamic debug's * control. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, * don't print message. */ #define pm_pr_dbg(fmt, ...) \ __pm_pr_dbg(fmt, ##__VA_ARGS__) #define pm_deferred_pr_dbg(fmt, ...) \ __pm_deferred_pr_dbg(fmt, ##__VA_ARGS__) #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ void queue_up_suspend_work(void); #else /* !CONFIG_PM_AUTOSLEEP */ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ enum suspend_stat_step { SUSPEND_WORKING = 0, SUSPEND_FREEZE, SUSPEND_PREPARE, SUSPEND_SUSPEND, SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; void dpm_save_failed_dev(const char *name); void dpm_save_failed_step(enum suspend_stat_step step); #endif /* _LINUX_SUSPEND_H */
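The header above exposes suspend_set_ops() and the platform_suspend_ops table it installs; per its kerneldoc, @enter() is the only mandatory callback, and suspend_valid_only_mem() can be plugged into @valid when only mem sleep is supported. A minimal sketch of such a registration follows; the foo_* names and the initcall hook are hypothetical and only illustrate how the pieces fit together.

#include <linux/init.h>
#include <linux/suspend.h>

static int foo_suspend_begin(suspend_state_t state)
{
	/* Runs right before devices are suspended; record the target state. */
	return 0;
}

static int foo_suspend_enter(suspend_state_t state)
{
	/* Enter the platform low-power state; the one mandatory callback. */
	return 0;
}

static const struct platform_suspend_ops foo_suspend_ops = {
	.valid	= suspend_valid_only_mem,	/* advertise mem sleep only */
	.begin	= foo_suspend_begin,
	.enter	= foo_suspend_enter,
};

static int __init foo_pm_init(void)
{
	suspend_set_ops(&foo_suspend_ops);
	return 0;
}
core_initcall(foo_pm_init);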
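For code that does not own the platform ops, register_pm_notifier() and the PM_* event codes declared above are the usual hook into system-wide suspend and hibernation transitions. The sketch below is a hedged example with hypothetical bar_* identifiers.

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/suspend.h>

static int bar_pm_notify(struct notifier_block *nb, unsigned long event, void *data)
{
	switch (event) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		/* Quiesce state before devices start suspending. */
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		/* Undo the preparation after resume or an aborted transition. */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block bar_pm_nb = {
	.notifier_call = bar_pm_notify,
};

static int __init bar_pm_init(void)
{
	return register_pm_notifier(&bar_pm_nb);
}
device_initcall(bar_pm_init);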
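struct platform_hibernation_ops is installed the same way through hibernation_set_ops(); its kerneldoc above states that every method except @recover() must be provided. The empty stubs below (baz_* names hypothetical) only mark where the platform work would go.

#include <linux/init.h>
#include <linux/suspend.h>

static int  baz_hib_begin(pm_message_t stage)	{ return 0; }
static void baz_hib_end(void)			{ }
static int  baz_hib_pre_snapshot(void)		{ return 0; }
static void baz_hib_finish(void)		{ }
static int  baz_hib_prepare(void)		{ return 0; }
static int  baz_hib_enter(void)			{ return 0; /* low-power state */ }
static void baz_hib_leave(void)			{ }
static int  baz_hib_pre_restore(void)		{ return 0; }
static void baz_hib_restore_cleanup(void)	{ }

static const struct platform_hibernation_ops baz_hibernation_ops = {
	.begin		 = baz_hib_begin,
	.end		 = baz_hib_end,
	.pre_snapshot	 = baz_hib_pre_snapshot,
	.finish		 = baz_hib_finish,
	.prepare	 = baz_hib_prepare,
	.enter		 = baz_hib_enter,
	.leave		 = baz_hib_leave,
	.pre_restore	 = baz_hib_pre_restore,
	.restore_cleanup = baz_hib_restore_cleanup,
	/* .recover is optional and left unset here */
};

static int __init baz_hib_init(void)
{
	hibernation_set_ops(&baz_hibernation_ops);
	return 0;
}
core_initcall(baz_hib_init);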
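pm_pr_dbg(), documented above, is used like pr_debug() but is additionally gated by pm_debug_messages_on (or dynamic debug when that is enabled). A tiny hedged usage sketch, with an invented message and a hypothetical qux_* helper:

#include <linux/suspend.h>

static void qux_log_target(suspend_state_t state)
{
	/* Printed only when PM debug messages are enabled (see above). */
	pm_pr_dbg("target sleep state: %d\n", (int)state);
}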
// SPDX-License-Identifier: GPL-2.0+
/*****************************************************************************/ /* * devio.c -- User space communication with USB devices. * * Copyright (C) 1999-2000 Thomas Sailer (sailer@ife.ee.ethz.ch) * * This file implements the usbfs/x/y files, where * x is the bus number and y the device number. * * It allows user space programs/"drivers" to communicate directly * with USB devices without intervening kernel driver. * * Revision history * 22.12.1999 0.1 Initial release (split from proc_usb.c) * 04.01.2000 0.2 Turned into its own filesystem * 30.09.2005 0.3 Fix user-triggerable oops in async URB delivery * (CAN-2005-3055) */ /*****************************************************************************/ #include <linux/fs.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/signal.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/string.h> #include <linux/usb.h> #include <linux/usbdevice_fs.h> #include <linux/usb/hcd.h> /* for usbcore internals */ #include <linux/usb/quirks.h> #include <linux/cdev.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/scatterlist.h> #include <linux/uaccess.h> #include <linux/dma-mapping.h> #include <asm/byteorder.h> #include <linux/moduleparam.h> #include "usb.h" #ifdef CONFIG_PM #define MAYBE_CAP_SUSPEND USBDEVFS_CAP_SUSPEND #else #define MAYBE_CAP_SUSPEND 0 #endif #define USB_MAXBUS 64 #define USB_DEVICE_MAX (USB_MAXBUS * 128) #define USB_SG_SIZE 16384 /* split-size for large txs */ /* Mutual exclusion for ps->list in resume vs. release and remove */ static DEFINE_MUTEX(usbfs_mutex); struct usb_dev_state { struct list_head list; /* state list */ struct usb_device *dev; struct file *file; spinlock_t lock; /* protects the async urb lists */ struct list_head async_pending; struct list_head async_completed; struct list_head memory_list; wait_queue_head_t wait; /* wake up if a request completed */ wait_queue_head_t wait_for_resume; /* wake up upon runtime resume */ unsigned int discsignr; struct pid *disc_pid; const struct cred *cred; sigval_t disccontext; unsigned long ifclaimed; u32 disabled_bulk_eps; unsigned long interface_allowed_mask; int not_yet_resumed; bool suspend_allowed; bool privileges_dropped; }; struct usb_memory { struct list_head memlist; int vma_use_count; int urb_use_count; u32 size; void *mem; dma_addr_t dma_handle; unsigned long vm_start; struct usb_dev_state *ps; }; struct async { struct list_head asynclist; struct usb_dev_state *ps; struct pid *pid; const struct cred *cred; unsigned int signr; unsigned int ifnum; void __user *userbuffer; void __user *userurb; sigval_t userurb_sigval; struct urb *urb; struct usb_memory *usbm; unsigned int mem_usage; int status; u8 bulk_addr; u8 bulk_status; }; static bool usbfs_snoop; module_param(usbfs_snoop, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usbfs_snoop, "true to log all usbfs traffic"); static unsigned usbfs_snoop_max = 65536; module_param(usbfs_snoop_max, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(usbfs_snoop_max, "maximum number of bytes to print while snooping"); #define snoop(dev, format, arg...) 
\ do { \ if (usbfs_snoop) \ dev_info(dev, format, ## arg); \ } while (0) enum snoop_when { SUBMIT, COMPLETE }; #define USB_DEVICE_DEV MKDEV(USB_DEVICE_MAJOR, 0) /* Limit on the total amount of memory we can allocate for transfers */ static u32 usbfs_memory_mb = 16; module_param(usbfs_memory_mb, uint, 0644); MODULE_PARM_DESC(usbfs_memory_mb, "maximum MB allowed for usbfs buffers (0 = no limit)"); /* Hard limit, necessary to avoid arithmetic overflow */ #define USBFS_XFER_MAX (UINT_MAX / 2 - 1000000) static DEFINE_SPINLOCK(usbfs_memory_usage_lock); static u64 usbfs_memory_usage; /* Total memory currently allocated */ /* Check whether it's okay to allocate more memory for a transfer */ static int usbfs_increase_memory_usage(u64 amount) { u64 lim, total_mem; unsigned long flags; int ret; lim = READ_ONCE(usbfs_memory_mb); lim <<= 20; ret = 0; spin_lock_irqsave(&usbfs_memory_usage_lock, flags); total_mem = usbfs_memory_usage + amount; if (lim > 0 && total_mem > lim) ret = -ENOMEM; else usbfs_memory_usage = total_mem; spin_unlock_irqrestore(&usbfs_memory_usage_lock, flags); return ret; } /* Memory for a transfer is being deallocated */ static void usbfs_decrease_memory_usage(u64 amount) { unsigned long flags; spin_lock_irqsave(&usbfs_memory_usage_lock, flags); if (amount > usbfs_memory_usage) usbfs_memory_usage = 0; else usbfs_memory_usage -= amount; spin_unlock_irqrestore(&usbfs_memory_usage_lock, flags); } static int connected(struct usb_dev_state *ps) { return (!list_empty(&ps->list) && ps->dev->state != USB_STATE_NOTATTACHED); } static void dec_usb_memory_use_count(struct usb_memory *usbm, int *count) { struct usb_dev_state *ps = usbm->ps; struct usb_hcd *hcd = bus_to_hcd(ps->dev->bus); unsigned long flags; spin_lock_irqsave(&ps->lock, flags); --*count; if (usbm->urb_use_count == 0 && usbm->vma_use_count == 0) { list_del(&usbm->memlist); spin_unlock_irqrestore(&ps->lock, flags); hcd_buffer_free_pages(hcd, usbm->size, usbm->mem, usbm->dma_handle); usbfs_decrease_memory_usage( usbm->size + sizeof(struct usb_memory)); kfree(usbm); } else { spin_unlock_irqrestore(&ps->lock, flags); } } static void usbdev_vm_open(struct vm_area_struct *vma) { struct usb_memory *usbm = vma->vm_private_data; unsigned long flags; spin_lock_irqsave(&usbm->ps->lock, flags); ++usbm->vma_use_count; spin_unlock_irqrestore(&usbm->ps->lock, flags); } static void usbdev_vm_close(struct vm_area_struct *vma) { struct usb_memory *usbm = vma->vm_private_data; dec_usb_memory_use_count(usbm, &usbm->vma_use_count); } static const struct vm_operations_struct usbdev_vm_ops = { .open = usbdev_vm_open, .close = usbdev_vm_close }; static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) { struct usb_memory *usbm = NULL; struct usb_dev_state *ps = file->private_data; struct usb_hcd *hcd = bus_to_hcd(ps->dev->bus); size_t size = vma->vm_end - vma->vm_start; void *mem; unsigned long flags; dma_addr_t dma_handle = DMA_MAPPING_ERROR; int ret; ret = usbfs_increase_memory_usage(size + sizeof(struct usb_memory)); if (ret) goto error; usbm = kzalloc(sizeof(struct usb_memory), GFP_KERNEL); if (!usbm) { ret = -ENOMEM; goto error_decrease_mem; } mem = hcd_buffer_alloc_pages(hcd, size, GFP_USER | __GFP_NOWARN, &dma_handle); if (!mem) { ret = -ENOMEM; goto error_free_usbm; } memset(mem, 0, size); usbm->mem = mem; usbm->dma_handle = dma_handle; usbm->size = size; usbm->ps = ps; usbm->vm_start = vma->vm_start; usbm->vma_use_count = 1; INIT_LIST_HEAD(&usbm->memlist); /* * In DMA-unavailable cases, hcd_buffer_alloc_pages allocates * normal 
pages and assigns DMA_MAPPING_ERROR to dma_handle. Check * whether we are in such cases, and then use remap_pfn_range (or * dma_mmap_coherent) to map normal (or DMA) pages into the user * space, respectively. */ if (dma_handle == DMA_MAPPING_ERROR) { if (remap_pfn_range(vma, vma->vm_start, virt_to_phys(usbm->mem) >> PAGE_SHIFT, size, vma->vm_page_prot) < 0) { dec_usb_memory_use_count(usbm, &usbm->vma_use_count); return -EAGAIN; } } else { if (dma_mmap_coherent(hcd->self.sysdev, vma, mem, dma_handle, size)) { dec_usb_memory_use_count(usbm, &usbm->vma_use_count); return -EAGAIN; } } vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; spin_lock_irqsave(&ps->lock, flags); list_add_tail(&usbm->memlist, &ps->memory_list); spin_unlock_irqrestore(&ps->lock, flags); return 0; error_free_usbm: kfree(usbm); error_decrease_mem: usbfs_decrease_memory_usage(size + sizeof(struct usb_memory)); error: return ret; } static ssize_t usbdev_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct usb_dev_state *ps = file->private_data; struct usb_device *dev = ps->dev; ssize_t ret = 0; unsigned len; loff_t pos; int i; pos = *ppos; usb_lock_device(dev); if (!connected(ps)) { ret = -ENODEV; goto err; } else if (pos < 0) { ret = -EINVAL; goto err; } if (pos < sizeof(struct usb_device_descriptor)) { /* 18 bytes - fits on the stack */ struct usb_device_descriptor temp_desc; memcpy(&temp_desc, &dev->descriptor, sizeof(dev->descriptor)); le16_to_cpus(&temp_desc.bcdUSB); le16_to_cpus(&temp_desc.idVendor); le16_to_cpus(&temp_desc.idProduct); le16_to_cpus(&temp_desc.bcdDevice); len = sizeof(struct usb_device_descriptor) - pos; if (len > nbytes) len = nbytes; if (copy_to_user(buf, ((char *)&temp_desc) + pos, len)) { ret = -EFAULT; goto err; } *ppos += len; buf += len; nbytes -= len; ret += len; } pos = sizeof(struct usb_device_descriptor); for (i = 0; nbytes && i < dev->descriptor.bNumConfigurations; i++) { struct usb_config_descriptor *config = (struct usb_config_descriptor *)dev->rawdescriptors[i]; unsigned int length = le16_to_cpu(config->wTotalLength); if (*ppos < pos + length) { /* The descriptor may claim to be longer than it * really is. Here is the actual allocated length. 
*/ unsigned alloclen = le16_to_cpu(dev->config[i].desc.wTotalLength); len = length - (*ppos - pos); if (len > nbytes) len = nbytes; /* Simply don't write (skip over) unallocated parts */ if (alloclen > (*ppos - pos)) { alloclen -= (*ppos - pos); if (copy_to_user(buf, dev->rawdescriptors[i] + (*ppos - pos), min(len, alloclen))) { ret = -EFAULT; goto err; } } *ppos += len; buf += len; nbytes -= len; ret += len; } pos += length; } err: usb_unlock_device(dev); return ret; } /* * async list handling */ static struct async *alloc_async(unsigned int numisoframes) { struct async *as; as = kzalloc(sizeof(struct async), GFP_KERNEL); if (!as) return NULL; as->urb = usb_alloc_urb(numisoframes, GFP_KERNEL); if (!as->urb) { kfree(as); return NULL; } return as; } static void free_async(struct async *as) { int i; put_pid(as->pid); if (as->cred) put_cred(as->cred); for (i = 0; i < as->urb->num_sgs; i++) { if (sg_page(&as->urb->sg[i])) kfree(sg_virt(&as->urb->sg[i])); } kfree(as->urb->sg); if (as->usbm == NULL) kfree(as->urb->transfer_buffer); else dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count); kfree(as->urb->setup_packet); usb_free_urb(as->urb); usbfs_decrease_memory_usage(as->mem_usage); kfree(as); } static void async_newpending(struct async *as) { struct usb_dev_state *ps = as->ps; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); list_add_tail(&as->asynclist, &ps->async_pending); spin_unlock_irqrestore(&ps->lock, flags); } static void async_removepending(struct async *as) { struct usb_dev_state *ps = as->ps; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); list_del_init(&as->asynclist); spin_unlock_irqrestore(&ps->lock, flags); } static struct async *async_getcompleted(struct usb_dev_state *ps) { unsigned long flags; struct async *as = NULL; spin_lock_irqsave(&ps->lock, flags); if (!list_empty(&ps->async_completed)) { as = list_entry(ps->async_completed.next, struct async, asynclist); list_del_init(&as->asynclist); } spin_unlock_irqrestore(&ps->lock, flags); return as; } static struct async *async_getpending(struct usb_dev_state *ps, void __user *userurb) { struct async *as; list_for_each_entry(as, &ps->async_pending, asynclist) if (as->userurb == userurb) { list_del_init(&as->asynclist); return as; } return NULL; } static void snoop_urb(struct usb_device *udev, void __user *userurb, int pipe, unsigned length, int timeout_or_status, enum snoop_when when, unsigned char *data, unsigned data_len) { static const char *types[] = {"isoc", "int", "ctrl", "bulk"}; static const char *dirs[] = {"out", "in"}; int ep; const char *t, *d; if (!usbfs_snoop) return; ep = usb_pipeendpoint(pipe); t = types[usb_pipetype(pipe)]; d = dirs[!!usb_pipein(pipe)]; if (userurb) { /* Async */ if (when == SUBMIT) dev_info(&udev->dev, "userurb %px, ep%d %s-%s, " "length %u\n", userurb, ep, t, d, length); else dev_info(&udev->dev, "userurb %px, ep%d %s-%s, " "actual_length %u status %d\n", userurb, ep, t, d, length, timeout_or_status); } else { if (when == SUBMIT) dev_info(&udev->dev, "ep%d %s-%s, length %u, " "timeout %d\n", ep, t, d, length, timeout_or_status); else dev_info(&udev->dev, "ep%d %s-%s, actual_length %u, " "status %d\n", ep, t, d, length, timeout_or_status); } data_len = min(data_len, usbfs_snoop_max); if (data && data_len > 0) { print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, data, data_len, 1); } } static void snoop_urb_data(struct urb *urb, unsigned len) { int i, size; len = min(len, usbfs_snoop_max); if (!usbfs_snoop || len == 0) return; if (urb->num_sgs == 0) { 
print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, urb->transfer_buffer, len, 1); return; } for (i = 0; i < urb->num_sgs && len; i++) { size = (len > USB_SG_SIZE) ? USB_SG_SIZE : len; print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_NONE, 32, 1, sg_virt(&urb->sg[i]), size, 1); len -= size; } } static int copy_urb_data_to_user(u8 __user *userbuffer, struct urb *urb) { unsigned i, len, size; if (urb->number_of_packets > 0) /* Isochronous */ len = urb->transfer_buffer_length; else /* Non-Isoc */ len = urb->actual_length; if (urb->num_sgs == 0) { if (copy_to_user(userbuffer, urb->transfer_buffer, len)) return -EFAULT; return 0; } for (i = 0; i < urb->num_sgs && len; i++) { size = (len > USB_SG_SIZE) ? USB_SG_SIZE : len; if (copy_to_user(userbuffer, sg_virt(&urb->sg[i]), size)) return -EFAULT; userbuffer += size; len -= size; } return 0; } #define AS_CONTINUATION 1 #define AS_UNLINK 2 static void cancel_bulk_urbs(struct usb_dev_state *ps, unsigned bulk_addr) __releases(ps->lock) __acquires(ps->lock) { struct urb *urb; struct async *as; /* Mark all the pending URBs that match bulk_addr, up to but not * including the first one without AS_CONTINUATION. If such an * URB is encountered then a new transfer has already started so * the endpoint doesn't need to be disabled; otherwise it does. */ list_for_each_entry(as, &ps->async_pending, asynclist) { if (as->bulk_addr == bulk_addr) { if (as->bulk_status != AS_CONTINUATION) goto rescan; as->bulk_status = AS_UNLINK; as->bulk_addr = 0; } } ps->disabled_bulk_eps |= (1 << bulk_addr); /* Now carefully unlink all the marked pending URBs */ rescan: list_for_each_entry_reverse(as, &ps->async_pending, asynclist) { if (as->bulk_status == AS_UNLINK) { as->bulk_status = 0; /* Only once */ urb = as->urb; usb_get_urb(urb); spin_unlock(&ps->lock); /* Allow completions */ usb_unlink_urb(urb); usb_put_urb(urb); spin_lock(&ps->lock); goto rescan; } } } static void async_completed(struct urb *urb) { struct async *as = urb->context; struct usb_dev_state *ps = as->ps; struct pid *pid = NULL; const struct cred *cred = NULL; unsigned long flags; sigval_t addr; int signr, errno; spin_lock_irqsave(&ps->lock, flags); list_move_tail(&as->asynclist, &ps->async_completed); as->status = urb->status; signr = as->signr; if (signr) { errno = as->status; addr = as->userurb_sigval; pid = get_pid(as->pid); cred = get_cred(as->cred); } snoop(&urb->dev->dev, "urb complete\n"); snoop_urb(urb->dev, as->userurb, urb->pipe, urb->actual_length, as->status, COMPLETE, NULL, 0); if (usb_urb_dir_in(urb)) snoop_urb_data(urb, urb->actual_length); if (as->status < 0 && as->bulk_addr && as->status != -ECONNRESET && as->status != -ENOENT) cancel_bulk_urbs(ps, as->bulk_addr); wake_up(&ps->wait); spin_unlock_irqrestore(&ps->lock, flags); if (signr) { kill_pid_usb_asyncio(signr, errno, addr, pid, cred); put_pid(pid); put_cred(cred); } } static void destroy_async(struct usb_dev_state *ps, struct list_head *list) { struct urb *urb; struct async *as; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); while (!list_empty(list)) { as = list_last_entry(list, struct async, asynclist); list_del_init(&as->asynclist); urb = as->urb; usb_get_urb(urb); /* drop the spinlock so the completion handler can run */ spin_unlock_irqrestore(&ps->lock, flags); usb_kill_urb(urb); usb_put_urb(urb); spin_lock_irqsave(&ps->lock, flags); } spin_unlock_irqrestore(&ps->lock, flags); } static void destroy_async_on_interface(struct usb_dev_state *ps, unsigned int ifnum) { struct list_head *p, *q, hitlist; unsigned long 
flags; INIT_LIST_HEAD(&hitlist); spin_lock_irqsave(&ps->lock, flags); list_for_each_safe(p, q, &ps->async_pending) if (ifnum == list_entry(p, struct async, asynclist)->ifnum) list_move_tail(p, &hitlist); spin_unlock_irqrestore(&ps->lock, flags); destroy_async(ps, &hitlist); } static void destroy_all_async(struct usb_dev_state *ps) { destroy_async(ps, &ps->async_pending); } /* * interface claims are made only at the request of user level code, * which can also release them (explicitly or by closing files). * they're also undone when devices disconnect. */ static int driver_probe(struct usb_interface *intf, const struct usb_device_id *id) { return -ENODEV; } static void driver_disconnect(struct usb_interface *intf) { struct usb_dev_state *ps = usb_get_intfdata(intf); unsigned int ifnum = intf->altsetting->desc.bInterfaceNumber; if (!ps) return; /* NOTE: this relies on usbcore having canceled and completed * all pending I/O requests; 2.6 does that. */ if (likely(ifnum < 8*sizeof(ps->ifclaimed))) clear_bit(ifnum, &ps->ifclaimed); else dev_warn(&intf->dev, "interface number %u out of range\n", ifnum); usb_set_intfdata(intf, NULL); /* force async requests to complete */ destroy_async_on_interface(ps, ifnum); } /* We don't care about suspend/resume of claimed interfaces */ static int driver_suspend(struct usb_interface *intf, pm_message_t msg) { return 0; } static int driver_resume(struct usb_interface *intf) { return 0; } #ifdef CONFIG_PM /* The following routines apply to the entire device, not interfaces */ void usbfs_notify_suspend(struct usb_device *udev) { /* We don't need to handle this */ } void usbfs_notify_resume(struct usb_device *udev) { struct usb_dev_state *ps; /* Protect against simultaneous remove or release */ mutex_lock(&usbfs_mutex); list_for_each_entry(ps, &udev->filelist, list) { WRITE_ONCE(ps->not_yet_resumed, 0); wake_up_all(&ps->wait_for_resume); } mutex_unlock(&usbfs_mutex); } #endif struct usb_driver usbfs_driver = { .name = "usbfs", .probe = driver_probe, .disconnect = driver_disconnect, .suspend = driver_suspend, .resume = driver_resume, .supports_autosuspend = 1, }; static int claimintf(struct usb_dev_state *ps, unsigned int ifnum) { struct usb_device *dev = ps->dev; struct usb_interface *intf; int err; if (ifnum >= 8*sizeof(ps->ifclaimed)) return -EINVAL; /* already claimed */ if (test_bit(ifnum, &ps->ifclaimed)) return 0; if (ps->privileges_dropped && !test_bit(ifnum, &ps->interface_allowed_mask)) return -EACCES; intf = usb_ifnum_to_if(dev, ifnum); if (!intf) err = -ENOENT; else { unsigned int old_suppress; /* suppress uevents while claiming interface */ old_suppress = dev_get_uevent_suppress(&intf->dev); dev_set_uevent_suppress(&intf->dev, 1); err = usb_driver_claim_interface(&usbfs_driver, intf, ps); dev_set_uevent_suppress(&intf->dev, old_suppress); } if (err == 0) set_bit(ifnum, &ps->ifclaimed); return err; } static int releaseintf(struct usb_dev_state *ps, unsigned int ifnum) { struct usb_device *dev; struct usb_interface *intf; int err; err = -EINVAL; if (ifnum >= 8*sizeof(ps->ifclaimed)) return err; dev = ps->dev; intf = usb_ifnum_to_if(dev, ifnum); if (!intf) err = -ENOENT; else if (test_and_clear_bit(ifnum, &ps->ifclaimed)) { unsigned int old_suppress; /* suppress uevents while releasing interface */ old_suppress = dev_get_uevent_suppress(&intf->dev); dev_set_uevent_suppress(&intf->dev, 1); usb_driver_release_interface(&usbfs_driver, intf); dev_set_uevent_suppress(&intf->dev, old_suppress); err = 0; } return err; } static int checkintf(struct usb_dev_state 
*ps, unsigned int ifnum) { if (ps->dev->state != USB_STATE_CONFIGURED) return -EHOSTUNREACH; if (ifnum >= 8*sizeof(ps->ifclaimed)) return -EINVAL; if (test_bit(ifnum, &ps->ifclaimed)) return 0; /* if not yet claimed, claim it for the driver */ dev_warn(&ps->dev->dev, "usbfs: process %d (%s) did not claim " "interface %u before use\n", task_pid_nr(current), current->comm, ifnum); return claimintf(ps, ifnum); } static int findintfep(struct usb_device *dev, unsigned int ep) { unsigned int i, j, e; struct usb_interface *intf; struct usb_host_interface *alts; struct usb_endpoint_descriptor *endpt; if (ep & ~(USB_DIR_IN|0xf)) return -EINVAL; if (!dev->actconfig) return -ESRCH; for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { intf = dev->actconfig->interface[i]; for (j = 0; j < intf->num_altsetting; j++) { alts = &intf->altsetting[j]; for (e = 0; e < alts->desc.bNumEndpoints; e++) { endpt = &alts->endpoint[e].desc; if (endpt->bEndpointAddress == ep) return alts->desc.bInterfaceNumber; } } } return -ENOENT; } static int check_ctrlrecip(struct usb_dev_state *ps, unsigned int requesttype, unsigned int request, unsigned int index) { int ret = 0; struct usb_host_interface *alt_setting; if (ps->dev->state != USB_STATE_UNAUTHENTICATED && ps->dev->state != USB_STATE_ADDRESS && ps->dev->state != USB_STATE_CONFIGURED) return -EHOSTUNREACH; if (USB_TYPE_VENDOR == (USB_TYPE_MASK & requesttype)) return 0; /* * check for the special corner case 'get_device_id' in the printer * class specification, which we always want to allow as it is used * to query things like ink level, etc. */ if (requesttype == 0xa1 && request == 0) { alt_setting = usb_find_alt_setting(ps->dev->actconfig, index >> 8, index & 0xff); if (alt_setting && alt_setting->desc.bInterfaceClass == USB_CLASS_PRINTER) return 0; } index &= 0xff; switch (requesttype & USB_RECIP_MASK) { case USB_RECIP_ENDPOINT: if ((index & ~USB_DIR_IN) == 0) return 0; ret = findintfep(ps->dev, index); if (ret < 0) { /* * Some not fully compliant Win apps seem to get * index wrong and have the endpoint number here * rather than the endpoint address (with the * correct direction). Win does let this through, * so we'll not reject it here but leave it to * the device to not break KVM. But we warn. 
*/ ret = findintfep(ps->dev, index ^ 0x80); if (ret >= 0) dev_info(&ps->dev->dev, "%s: process %i (%s) requesting ep %02x but needs %02x\n", __func__, task_pid_nr(current), current->comm, index, index ^ 0x80); } if (ret >= 0) ret = checkintf(ps, ret); break; case USB_RECIP_INTERFACE: ret = checkintf(ps, index); break; } return ret; } static struct usb_host_endpoint *ep_to_host_endpoint(struct usb_device *dev, unsigned char ep) { if (ep & USB_ENDPOINT_DIR_MASK) return dev->ep_in[ep & USB_ENDPOINT_NUMBER_MASK]; else return dev->ep_out[ep & USB_ENDPOINT_NUMBER_MASK]; } static int parse_usbdevfs_streams(struct usb_dev_state *ps, struct usbdevfs_streams __user *streams, unsigned int *num_streams_ret, unsigned int *num_eps_ret, struct usb_host_endpoint ***eps_ret, struct usb_interface **intf_ret) { unsigned int i, num_streams, num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf = NULL; unsigned char ep; int ifnum, ret; if (get_user(num_streams, &streams->num_streams) || get_user(num_eps, &streams->num_eps)) return -EFAULT; if (num_eps < 1 || num_eps > USB_MAXENDPOINTS) return -EINVAL; /* The XHCI controller allows max 2 ^ 16 streams */ if (num_streams_ret && (num_streams < 2 || num_streams > 65536)) return -EINVAL; eps = kmalloc_array(num_eps, sizeof(*eps), GFP_KERNEL); if (!eps) return -ENOMEM; for (i = 0; i < num_eps; i++) { if (get_user(ep, &streams->eps[i])) { ret = -EFAULT; goto error; } eps[i] = ep_to_host_endpoint(ps->dev, ep); if (!eps[i]) { ret = -EINVAL; goto error; } /* usb_alloc/free_streams operate on an usb_interface */ ifnum = findintfep(ps->dev, ep); if (ifnum < 0) { ret = ifnum; goto error; } if (i == 0) { ret = checkintf(ps, ifnum); if (ret < 0) goto error; intf = usb_ifnum_to_if(ps->dev, ifnum); } else { /* Verify all eps belong to the same interface */ if (ifnum != intf->altsetting->desc.bInterfaceNumber) { ret = -EINVAL; goto error; } } } if (num_streams_ret) *num_streams_ret = num_streams; *num_eps_ret = num_eps; *eps_ret = eps; *intf_ret = intf; return 0; error: kfree(eps); return ret; } static struct usb_device *usbdev_lookup_by_devt(dev_t devt) { struct device *dev; dev = bus_find_device_by_devt(&usb_bus_type, devt); if (!dev) return NULL; return to_usb_device(dev); } /* * file operations */ static int usbdev_open(struct inode *inode, struct file *file) { struct usb_device *dev = NULL; struct usb_dev_state *ps; int ret; ret = -ENOMEM; ps = kzalloc(sizeof(struct usb_dev_state), GFP_KERNEL); if (!ps) goto out_free_ps; ret = -ENODEV; /* usbdev device-node */ if (imajor(inode) == USB_DEVICE_MAJOR) dev = usbdev_lookup_by_devt(inode->i_rdev); if (!dev) goto out_free_ps; usb_lock_device(dev); if (dev->state == USB_STATE_NOTATTACHED) goto out_unlock_device; ret = usb_autoresume_device(dev); if (ret) goto out_unlock_device; ps->dev = dev; ps->file = file; ps->interface_allowed_mask = 0xFFFFFFFF; /* 32 bits */ spin_lock_init(&ps->lock); INIT_LIST_HEAD(&ps->list); INIT_LIST_HEAD(&ps->async_pending); INIT_LIST_HEAD(&ps->async_completed); INIT_LIST_HEAD(&ps->memory_list); init_waitqueue_head(&ps->wait); init_waitqueue_head(&ps->wait_for_resume); ps->disc_pid = get_pid(task_pid(current)); ps->cred = get_current_cred(); smp_wmb(); /* Can't race with resume; the device is already active */ list_add_tail(&ps->list, &dev->filelist); file->private_data = ps; usb_unlock_device(dev); snoop(&dev->dev, "opened by process %d: %s\n", task_pid_nr(current), current->comm); return ret; out_unlock_device: usb_unlock_device(dev); usb_put_dev(dev); out_free_ps: kfree(ps); return ret; } 
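/*
 * For illustration only, not part of this driver: the file operations above
 * are consumed from user space through the usbfs character device. A hedged
 * sketch of that usage (device node path, interface number and buffer chosen
 * purely for illustration; needs <fcntl.h>, <sys/ioctl.h>,
 * <linux/usbdevice_fs.h> and <linux/usb/ch9.h>) looks roughly like this:
 *
 *	int fd = open("/dev/bus/usb/001/004", O_RDWR);	// hypothetical node
 *	unsigned int ifnum = 0;
 *	ioctl(fd, USBDEVFS_CLAIMINTERFACE, &ifnum);	// routed to claimintf()
 *
 *	unsigned char buf[18];
 *	struct usbdevfs_ctrltransfer ctrl = {
 *		.bRequestType	= USB_DIR_IN,		// device-to-host, standard
 *		.bRequest	= USB_REQ_GET_DESCRIPTOR,
 *		.wValue		= USB_DT_DEVICE << 8,
 *		.wIndex		= 0,
 *		.wLength	= sizeof(buf),		// device descriptor size
 *		.timeout	= 1000,			// milliseconds
 *		.data		= buf,
 *	};
 *	ioctl(fd, USBDEVFS_CONTROL, &ctrl);		// handled by do_proc_control()
 *
 * read() on the same fd returns the cached device and configuration
 * descriptors via usbdev_read() above, with multi-byte descriptor fields
 * converted to host byte order.
 */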
static int usbdev_release(struct inode *inode, struct file *file) { struct usb_dev_state *ps = file->private_data; struct usb_device *dev = ps->dev; unsigned int ifnum; struct async *as; usb_lock_device(dev); usb_hub_release_all_ports(dev, ps); /* Protect against simultaneous resume */ mutex_lock(&usbfs_mutex); list_del_init(&ps->list); mutex_unlock(&usbfs_mutex); for (ifnum = 0; ps->ifclaimed && ifnum < 8*sizeof(ps->ifclaimed); ifnum++) { if (test_bit(ifnum, &ps->ifclaimed)) releaseintf(ps, ifnum); } destroy_all_async(ps); if (!ps->suspend_allowed) usb_autosuspend_device(dev); usb_unlock_device(dev); usb_put_dev(dev); put_pid(ps->disc_pid); put_cred(ps->cred); as = async_getcompleted(ps); while (as) { free_async(as); as = async_getcompleted(ps); } kfree(ps); return 0; } static void usbfs_blocking_completion(struct urb *urb) { complete((struct completion *) urb->context); } /* * Much like usb_start_wait_urb, but returns status separately from * actual_length and uses a killable wait. */ static int usbfs_start_wait_urb(struct urb *urb, int timeout, unsigned int *actlen) { DECLARE_COMPLETION_ONSTACK(ctx); unsigned long expire; int rc; urb->context = &ctx; urb->complete = usbfs_blocking_completion; *actlen = 0; rc = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(rc)) return rc; expire = (timeout ? msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT); rc = wait_for_completion_killable_timeout(&ctx, expire); if (rc <= 0) { usb_kill_urb(urb); *actlen = urb->actual_length; if (urb->status != -ENOENT) ; /* Completed before it was killed */ else if (rc < 0) return -EINTR; else return -ETIMEDOUT; } *actlen = urb->actual_length; return urb->status; } static int do_proc_control(struct usb_dev_state *ps, struct usbdevfs_ctrltransfer *ctrl) { struct usb_device *dev = ps->dev; unsigned int tmo; unsigned char *tbuf; unsigned int wLength, actlen; int i, pipe, ret; struct urb *urb = NULL; struct usb_ctrlrequest *dr = NULL; ret = check_ctrlrecip(ps, ctrl->bRequestType, ctrl->bRequest, ctrl->wIndex); if (ret) return ret; wLength = ctrl->wLength; /* To suppress 64k PAGE_SIZE warning */ if (wLength > PAGE_SIZE) return -EINVAL; ret = usbfs_increase_memory_usage(PAGE_SIZE + sizeof(struct urb) + sizeof(struct usb_ctrlrequest)); if (ret) return ret; ret = -ENOMEM; tbuf = (unsigned char *)__get_free_page(GFP_KERNEL); if (!tbuf) goto done; urb = usb_alloc_urb(0, GFP_NOIO); if (!urb) goto done; dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_NOIO); if (!dr) goto done; dr->bRequestType = ctrl->bRequestType; dr->bRequest = ctrl->bRequest; dr->wValue = cpu_to_le16(ctrl->wValue); dr->wIndex = cpu_to_le16(ctrl->wIndex); dr->wLength = cpu_to_le16(ctrl->wLength); tmo = ctrl->timeout; snoop(&dev->dev, "control urb: bRequestType=%02x " "bRequest=%02x wValue=%04x " "wIndex=%04x wLength=%04x\n", ctrl->bRequestType, ctrl->bRequest, ctrl->wValue, ctrl->wIndex, ctrl->wLength); if ((ctrl->bRequestType & USB_DIR_IN) && wLength) { pipe = usb_rcvctrlpipe(dev, 0); usb_fill_control_urb(urb, dev, pipe, (unsigned char *) dr, tbuf, wLength, NULL, NULL); snoop_urb(dev, NULL, pipe, wLength, tmo, SUBMIT, NULL, 0); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &actlen); /* Linger a bit, prior to the next control message. 
*/ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, actlen, i, COMPLETE, tbuf, actlen); if (!i && actlen) { if (copy_to_user(ctrl->data, tbuf, actlen)) { ret = -EFAULT; goto done; } } } else { if (wLength) { if (copy_from_user(tbuf, ctrl->data, wLength)) { ret = -EFAULT; goto done; } } pipe = usb_sndctrlpipe(dev, 0); usb_fill_control_urb(urb, dev, pipe, (unsigned char *) dr, tbuf, wLength, NULL, NULL); snoop_urb(dev, NULL, pipe, wLength, tmo, SUBMIT, tbuf, wLength); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &actlen); /* Linger a bit, prior to the next control message. */ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, actlen, i, COMPLETE, NULL, 0); } if (i < 0 && i != -EPIPE) { dev_printk(KERN_DEBUG, &dev->dev, "usbfs: USBDEVFS_CONTROL " "failed cmd %s rqt %u rq %u len %u ret %d\n", current->comm, ctrl->bRequestType, ctrl->bRequest, ctrl->wLength, i); } ret = (i < 0 ? i : actlen); done: kfree(dr); usb_free_urb(urb); free_page((unsigned long) tbuf); usbfs_decrease_memory_usage(PAGE_SIZE + sizeof(struct urb) + sizeof(struct usb_ctrlrequest)); return ret; } static int proc_control(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_ctrltransfer ctrl; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) return -EFAULT; return do_proc_control(ps, &ctrl); } static int do_proc_bulk(struct usb_dev_state *ps, struct usbdevfs_bulktransfer *bulk) { struct usb_device *dev = ps->dev; unsigned int tmo, len1, len2, pipe; unsigned char *tbuf; int i, ret; struct urb *urb = NULL; struct usb_host_endpoint *ep; ret = findintfep(ps->dev, bulk->ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; len1 = bulk->len; if (len1 < 0 || len1 >= (INT_MAX - sizeof(struct urb))) return -EINVAL; if (bulk->ep & USB_DIR_IN) pipe = usb_rcvbulkpipe(dev, bulk->ep & 0x7f); else pipe = usb_sndbulkpipe(dev, bulk->ep & 0x7f); ep = usb_pipe_endpoint(dev, pipe); if (!ep || !usb_endpoint_maxp(&ep->desc)) return -EINVAL; ret = usbfs_increase_memory_usage(len1 + sizeof(struct urb)); if (ret) return ret; /* * len1 can be almost arbitrarily large. Don't WARN if it's * too big, just fail the request. */ ret = -ENOMEM; tbuf = kmalloc(len1, GFP_KERNEL | __GFP_NOWARN); if (!tbuf) goto done; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) goto done; if ((ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { pipe = (pipe & ~(3 << 30)) | (PIPE_INTERRUPT << 30); usb_fill_int_urb(urb, dev, pipe, tbuf, len1, NULL, NULL, ep->desc.bInterval); } else { usb_fill_bulk_urb(urb, dev, pipe, tbuf, len1, NULL, NULL); } tmo = bulk->timeout; if (bulk->ep & 0x80) { snoop_urb(dev, NULL, pipe, len1, tmo, SUBMIT, NULL, 0); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &len2); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, len2, i, COMPLETE, tbuf, len2); if (!i && len2) { if (copy_to_user(bulk->data, tbuf, len2)) { ret = -EFAULT; goto done; } } } else { if (len1) { if (copy_from_user(tbuf, bulk->data, len1)) { ret = -EFAULT; goto done; } } snoop_urb(dev, NULL, pipe, len1, tmo, SUBMIT, tbuf, len1); usb_unlock_device(dev); i = usbfs_start_wait_urb(urb, tmo, &len2); usb_lock_device(dev); snoop_urb(dev, NULL, pipe, len2, i, COMPLETE, NULL, 0); } ret = (i < 0 ? 
i : len2); done: usb_free_urb(urb); kfree(tbuf); usbfs_decrease_memory_usage(len1 + sizeof(struct urb)); return ret; } static int proc_bulk(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_bulktransfer bulk; if (copy_from_user(&bulk, arg, sizeof(bulk))) return -EFAULT; return do_proc_bulk(ps, &bulk); } static void check_reset_of_active_ep(struct usb_device *udev, unsigned int epnum, char *ioctl_name) { struct usb_host_endpoint **eps; struct usb_host_endpoint *ep; eps = (epnum & USB_DIR_IN) ? udev->ep_in : udev->ep_out; ep = eps[epnum & 0x0f]; if (ep && !list_empty(&ep->urb_list)) dev_warn(&udev->dev, "Process %d (%s) called USBDEVFS_%s for active endpoint 0x%02x\n", task_pid_nr(current), current->comm, ioctl_name, epnum); } static int proc_resetep(struct usb_dev_state *ps, void __user *arg) { unsigned int ep; int ret; if (get_user(ep, (unsigned int __user *)arg)) return -EFAULT; ret = findintfep(ps->dev, ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; check_reset_of_active_ep(ps->dev, ep, "RESETEP"); usb_reset_endpoint(ps->dev, ep); return 0; } static int proc_clearhalt(struct usb_dev_state *ps, void __user *arg) { unsigned int ep; int pipe; int ret; if (get_user(ep, (unsigned int __user *)arg)) return -EFAULT; ret = findintfep(ps->dev, ep); if (ret < 0) return ret; ret = checkintf(ps, ret); if (ret) return ret; check_reset_of_active_ep(ps->dev, ep, "CLEAR_HALT"); if (ep & USB_DIR_IN) pipe = usb_rcvbulkpipe(ps->dev, ep & 0x7f); else pipe = usb_sndbulkpipe(ps->dev, ep & 0x7f); return usb_clear_halt(ps->dev, pipe); } static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_getdriver gd; struct usb_interface *intf; int ret; if (copy_from_user(&gd, arg, sizeof(gd))) return -EFAULT; intf = usb_ifnum_to_if(ps->dev, gd.interface); if (!intf || !intf->dev.driver) ret = -ENODATA; else { strscpy(gd.driver, intf->dev.driver->name, sizeof(gd.driver)); ret = (copy_to_user(arg, &gd, sizeof(gd)) ? -EFAULT : 0); } return ret; } static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_connectinfo ci; memset(&ci, 0, sizeof(ci)); ci.devnum = ps->dev->devnum; ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; return 0; } static int proc_conninfo_ex(struct usb_dev_state *ps, void __user *arg, size_t size) { struct usbdevfs_conninfo_ex ci; struct usb_device *udev = ps->dev; if (size < sizeof(ci.size)) return -EINVAL; memset(&ci, 0, sizeof(ci)); ci.size = sizeof(ci); ci.busnum = udev->bus->busnum; ci.devnum = udev->devnum; ci.speed = udev->speed; while (udev && udev->portnum != 0) { if (++ci.num_ports <= ARRAY_SIZE(ci.ports)) ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports] = udev->portnum; udev = udev->parent; } if (ci.num_ports < ARRAY_SIZE(ci.ports)) memmove(&ci.ports[0], &ci.ports[ARRAY_SIZE(ci.ports) - ci.num_ports], ci.num_ports); if (copy_to_user(arg, &ci, min(sizeof(ci), size))) return -EFAULT; return 0; } static int proc_resetdevice(struct usb_dev_state *ps) { struct usb_host_config *actconfig = ps->dev->actconfig; struct usb_interface *interface; int i, number; /* Don't allow a device reset if the process has dropped the * privilege to do such things and any of the interfaces are * currently claimed. 
*/ if (ps->privileges_dropped && actconfig) { for (i = 0; i < actconfig->desc.bNumInterfaces; ++i) { interface = actconfig->interface[i]; number = interface->cur_altsetting->desc.bInterfaceNumber; if (usb_interface_claimed(interface) && !test_bit(number, &ps->ifclaimed)) { dev_warn(&ps->dev->dev, "usbfs: interface %d claimed by %s while '%s' resets device\n", number, interface->dev.driver->name, current->comm); return -EACCES; } } } return usb_reset_device(ps->dev); } static int proc_setintf(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_setinterface setintf; int ret; if (copy_from_user(&setintf, arg, sizeof(setintf))) return -EFAULT; ret = checkintf(ps, setintf.interface); if (ret) return ret; destroy_async_on_interface(ps, setintf.interface); return usb_set_interface(ps->dev, setintf.interface, setintf.altsetting); } static int proc_setconfig(struct usb_dev_state *ps, void __user *arg) { int u; int status = 0; struct usb_host_config *actconfig; if (get_user(u, (int __user *)arg)) return -EFAULT; actconfig = ps->dev->actconfig; /* Don't touch the device if any interfaces are claimed. * It could interfere with other drivers' operations, and if * an interface is claimed by usbfs it could easily deadlock. */ if (actconfig) { int i; for (i = 0; i < actconfig->desc.bNumInterfaces; ++i) { if (usb_interface_claimed(actconfig->interface[i])) { dev_warn(&ps->dev->dev, "usbfs: interface %d claimed by %s " "while '%s' sets config #%d\n", actconfig->interface[i] ->cur_altsetting ->desc.bInterfaceNumber, actconfig->interface[i] ->dev.driver->name, current->comm, u); status = -EBUSY; break; } } } /* SET_CONFIGURATION is often abused as a "cheap" driver reset, * so avoid usb_set_configuration()'s kick to sysfs */ if (status == 0) { if (actconfig && actconfig->desc.bConfigurationValue == u) status = usb_reset_configuration(ps->dev); else status = usb_set_configuration(ps->dev, u); } return status; } static struct usb_memory * find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb) { struct usb_memory *usbm = NULL, *iter; unsigned long flags; unsigned long uurb_start = (unsigned long)uurb->buffer; spin_lock_irqsave(&ps->lock, flags); list_for_each_entry(iter, &ps->memory_list, memlist) { if (uurb_start >= iter->vm_start && uurb_start < iter->vm_start + iter->size) { if (uurb->buffer_length > iter->vm_start + iter->size - uurb_start) { usbm = ERR_PTR(-EINVAL); } else { usbm = iter; usbm->urb_use_count++; } break; } } spin_unlock_irqrestore(&ps->lock, flags); return usbm; } static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb, struct usbdevfs_iso_packet_desc __user *iso_frame_desc, void __user *arg, sigval_t userurb_sigval) { struct usbdevfs_iso_packet_desc *isopkt = NULL; struct usb_host_endpoint *ep; struct async *as = NULL; struct usb_ctrlrequest *dr = NULL; unsigned int u, totlen, isofrmlen; int i, ret, num_sgs = 0, ifnum = -1; int number_of_packets = 0; unsigned int stream_id = 0; void *buf; bool is_in; bool allow_short = false; bool allow_zero = false; unsigned long mask = USBDEVFS_URB_SHORT_NOT_OK | USBDEVFS_URB_BULK_CONTINUATION | USBDEVFS_URB_NO_FSBR | USBDEVFS_URB_ZERO_PACKET | USBDEVFS_URB_NO_INTERRUPT; /* USBDEVFS_URB_ISO_ASAP is a special case */ if (uurb->type == USBDEVFS_URB_TYPE_ISO) mask |= USBDEVFS_URB_ISO_ASAP; if (uurb->flags & ~mask) return -EINVAL; if ((unsigned int)uurb->buffer_length >= USBFS_XFER_MAX) return -EINVAL; if (uurb->buffer_length > 0 && !uurb->buffer) return -EINVAL; if (!(uurb->type == 
USBDEVFS_URB_TYPE_CONTROL && (uurb->endpoint & ~USB_ENDPOINT_DIR_MASK) == 0)) { ifnum = findintfep(ps->dev, uurb->endpoint); if (ifnum < 0) return ifnum; ret = checkintf(ps, ifnum); if (ret) return ret; } ep = ep_to_host_endpoint(ps->dev, uurb->endpoint); if (!ep) return -ENOENT; is_in = (uurb->endpoint & USB_ENDPOINT_DIR_MASK) != 0; u = 0; switch (uurb->type) { case USBDEVFS_URB_TYPE_CONTROL: if (!usb_endpoint_xfer_control(&ep->desc)) return -EINVAL; /* min 8 byte setup packet */ if (uurb->buffer_length < 8) return -EINVAL; dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL); if (!dr) return -ENOMEM; if (copy_from_user(dr, uurb->buffer, 8)) { ret = -EFAULT; goto error; } if (uurb->buffer_length < (le16_to_cpu(dr->wLength) + 8)) { ret = -EINVAL; goto error; } ret = check_ctrlrecip(ps, dr->bRequestType, dr->bRequest, le16_to_cpu(dr->wIndex)); if (ret) goto error; uurb->buffer_length = le16_to_cpu(dr->wLength); uurb->buffer += 8; if ((dr->bRequestType & USB_DIR_IN) && uurb->buffer_length) { is_in = true; uurb->endpoint |= USB_DIR_IN; } else { is_in = false; uurb->endpoint &= ~USB_DIR_IN; } if (is_in) allow_short = true; snoop(&ps->dev->dev, "control urb: bRequestType=%02x " "bRequest=%02x wValue=%04x " "wIndex=%04x wLength=%04x\n", dr->bRequestType, dr->bRequest, __le16_to_cpu(dr->wValue), __le16_to_cpu(dr->wIndex), __le16_to_cpu(dr->wLength)); u = sizeof(struct usb_ctrlrequest); break; case USBDEVFS_URB_TYPE_BULK: if (!is_in) allow_zero = true; else allow_short = true; switch (usb_endpoint_type(&ep->desc)) { case USB_ENDPOINT_XFER_CONTROL: case USB_ENDPOINT_XFER_ISOC: return -EINVAL; case USB_ENDPOINT_XFER_INT: /* allow single-shot interrupt transfers */ uurb->type = USBDEVFS_URB_TYPE_INTERRUPT; goto interrupt_urb; } num_sgs = DIV_ROUND_UP(uurb->buffer_length, USB_SG_SIZE); if (num_sgs == 1 || num_sgs > ps->dev->bus->sg_tablesize) num_sgs = 0; if (ep->streams) stream_id = uurb->stream_id; break; case USBDEVFS_URB_TYPE_INTERRUPT: if (!usb_endpoint_xfer_int(&ep->desc)) return -EINVAL; interrupt_urb: if (!is_in) allow_zero = true; else allow_short = true; break; case USBDEVFS_URB_TYPE_ISO: /* arbitrary limit */ if (uurb->number_of_packets < 1 || uurb->number_of_packets > 128) return -EINVAL; if (!usb_endpoint_xfer_isoc(&ep->desc)) return -EINVAL; number_of_packets = uurb->number_of_packets; isofrmlen = sizeof(struct usbdevfs_iso_packet_desc) * number_of_packets; isopkt = memdup_user(iso_frame_desc, isofrmlen); if (IS_ERR(isopkt)) { ret = PTR_ERR(isopkt); isopkt = NULL; goto error; } for (totlen = u = 0; u < number_of_packets; u++) { /* * arbitrary limit need for USB 3.1 Gen2 * sizemax: 96 DPs at SSP, 96 * 1024 = 98304 */ if (isopkt[u].length > 98304) { ret = -EINVAL; goto error; } totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); uurb->buffer_length = totlen; break; default: return -EINVAL; } if (uurb->buffer_length > 0 && !access_ok(uurb->buffer, uurb->buffer_length)) { ret = -EFAULT; goto error; } as = alloc_async(number_of_packets); if (!as) { ret = -ENOMEM; goto error; } as->usbm = find_memory_area(ps, uurb); if (IS_ERR(as->usbm)) { ret = PTR_ERR(as->usbm); as->usbm = NULL; goto error; } /* do not use SG buffers when memory mapped segments * are in use */ if (as->usbm) num_sgs = 0; u += sizeof(struct async) + sizeof(struct urb) + (as->usbm ? 
0 : uurb->buffer_length) + num_sgs * sizeof(struct scatterlist); ret = usbfs_increase_memory_usage(u); if (ret) goto error; as->mem_usage = u; if (num_sgs) { as->urb->sg = kmalloc_array(num_sgs, sizeof(struct scatterlist), GFP_KERNEL | __GFP_NOWARN); if (!as->urb->sg) { ret = -ENOMEM; goto error; } as->urb->num_sgs = num_sgs; sg_init_table(as->urb->sg, as->urb->num_sgs); totlen = uurb->buffer_length; for (i = 0; i < as->urb->num_sgs; i++) { u = (totlen > USB_SG_SIZE) ? USB_SG_SIZE : totlen; buf = kmalloc(u, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto error; } sg_set_buf(&as->urb->sg[i], buf, u); if (!is_in) { if (copy_from_user(buf, uurb->buffer, u)) { ret = -EFAULT; goto error; } uurb->buffer += u; } totlen -= u; } } else if (uurb->buffer_length > 0) { if (as->usbm) { unsigned long uurb_start = (unsigned long)uurb->buffer; as->urb->transfer_buffer = as->usbm->mem + (uurb_start - as->usbm->vm_start); } else { as->urb->transfer_buffer = kmalloc(uurb->buffer_length, GFP_KERNEL | __GFP_NOWARN); if (!as->urb->transfer_buffer) { ret = -ENOMEM; goto error; } if (!is_in) { if (copy_from_user(as->urb->transfer_buffer, uurb->buffer, uurb->buffer_length)) { ret = -EFAULT; goto error; } } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) { /* * Isochronous input data may end up being * discontiguous if some of the packets are * short. Clear the buffer so that the gaps * don't leak kernel data to userspace. */ memset(as->urb->transfer_buffer, 0, uurb->buffer_length); } } } as->urb->dev = ps->dev; as->urb->pipe = (uurb->type << 30) | __create_pipe(ps->dev, uurb->endpoint & 0xf) | (uurb->endpoint & USB_DIR_IN); /* This tedious sequence is necessary because the URB_* flags * are internal to the kernel and subject to change, whereas * the USBDEVFS_URB_* flags are a user API and must not be changed. */ u = (is_in ? 
URB_DIR_IN : URB_DIR_OUT); if (uurb->flags & USBDEVFS_URB_ISO_ASAP) u |= URB_ISO_ASAP; if (allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) u |= URB_SHORT_NOT_OK; if (allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) u |= URB_ZERO_PACKET; if (uurb->flags & USBDEVFS_URB_NO_INTERRUPT) u |= URB_NO_INTERRUPT; as->urb->transfer_flags = u; if (!allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_SHORT_NOT_OK.\n"); if (!allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_ZERO_PACKET.\n"); as->urb->transfer_buffer_length = uurb->buffer_length; as->urb->setup_packet = (unsigned char *)dr; dr = NULL; as->urb->start_frame = uurb->start_frame; as->urb->number_of_packets = number_of_packets; as->urb->stream_id = stream_id; if (ep->desc.bInterval) { if (uurb->type == USBDEVFS_URB_TYPE_ISO || ps->dev->speed == USB_SPEED_HIGH || ps->dev->speed >= USB_SPEED_SUPER) as->urb->interval = 1 << min(15, ep->desc.bInterval - 1); else as->urb->interval = ep->desc.bInterval; } as->urb->context = as; as->urb->complete = async_completed; for (totlen = u = 0; u < number_of_packets; u++) { as->urb->iso_frame_desc[u].offset = totlen; as->urb->iso_frame_desc[u].length = isopkt[u].length; totlen += isopkt[u].length; } kfree(isopkt); isopkt = NULL; as->ps = ps; as->userurb = arg; as->userurb_sigval = userurb_sigval; if (as->usbm) { unsigned long uurb_start = (unsigned long)uurb->buffer; as->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; as->urb->transfer_dma = as->usbm->dma_handle + (uurb_start - as->usbm->vm_start); } else if (is_in && uurb->buffer_length > 0) as->userbuffer = uurb->buffer; as->signr = uurb->signr; as->ifnum = ifnum; as->pid = get_pid(task_pid(current)); as->cred = get_current_cred(); snoop_urb(ps->dev, as->userurb, as->urb->pipe, as->urb->transfer_buffer_length, 0, SUBMIT, NULL, 0); if (!is_in) snoop_urb_data(as->urb, as->urb->transfer_buffer_length); async_newpending(as); if (usb_endpoint_xfer_bulk(&ep->desc)) { spin_lock_irq(&ps->lock); /* Not exactly the endpoint address; the direction bit is * shifted to the 0x10 position so that the value will be * between 0 and 31. */ as->bulk_addr = usb_endpoint_num(&ep->desc) | ((ep->desc.bEndpointAddress & USB_ENDPOINT_DIR_MASK) >> 3); /* If this bulk URB is the start of a new transfer, re-enable * the endpoint. Otherwise mark it as a continuation URB. */ if (uurb->flags & USBDEVFS_URB_BULK_CONTINUATION) as->bulk_status = AS_CONTINUATION; else ps->disabled_bulk_eps &= ~(1 << as->bulk_addr); /* Don't accept continuation URBs if the endpoint is * disabled because of an earlier error. 
*/ if (ps->disabled_bulk_eps & (1 << as->bulk_addr)) ret = -EREMOTEIO; else ret = usb_submit_urb(as->urb, GFP_ATOMIC); spin_unlock_irq(&ps->lock); } else { ret = usb_submit_urb(as->urb, GFP_KERNEL); } if (ret) { dev_printk(KERN_DEBUG, &ps->dev->dev, "usbfs: usb_submit_urb returned %d\n", ret); snoop_urb(ps->dev, as->userurb, as->urb->pipe, 0, ret, COMPLETE, NULL, 0); async_removepending(as); goto error; } return 0; error: kfree(isopkt); kfree(dr); if (as) free_async(as); return ret; } static int proc_submiturb(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_urb uurb; sigval_t userurb_sigval; if (copy_from_user(&uurb, arg, sizeof(uurb))) return -EFAULT; memset(&userurb_sigval, 0, sizeof(userurb_sigval)); userurb_sigval.sival_ptr = arg; return proc_do_submiturb(ps, &uurb, (((struct usbdevfs_urb __user *)arg)->iso_frame_desc), arg, userurb_sigval); } static int proc_unlinkurb(struct usb_dev_state *ps, void __user *arg) { struct urb *urb; struct async *as; unsigned long flags; spin_lock_irqsave(&ps->lock, flags); as = async_getpending(ps, arg); if (!as) { spin_unlock_irqrestore(&ps->lock, flags); return -EINVAL; } urb = as->urb; usb_get_urb(urb); spin_unlock_irqrestore(&ps->lock, flags); usb_kill_urb(urb); usb_put_urb(urb); return 0; } static void compute_isochronous_actual_length(struct urb *urb) { unsigned int i; if (urb->number_of_packets > 0) { urb->actual_length = 0; for (i = 0; i < urb->number_of_packets; i++) urb->actual_length += urb->iso_frame_desc[i].actual_length; } } static int processcompl(struct async *as, void __user * __user *arg) { struct urb *urb = as->urb; struct usbdevfs_urb __user *userurb = as->userurb; void __user *addr = as->userurb; unsigned int i; compute_isochronous_actual_length(urb); if (as->userbuffer && urb->actual_length) { if (copy_urb_data_to_user(as->userbuffer, urb)) goto err_out; } if (put_user(as->status, &userurb->status)) goto err_out; if (put_user(urb->actual_length, &userurb->actual_length)) goto err_out; if (put_user(urb->error_count, &userurb->error_count)) goto err_out; if (usb_endpoint_xfer_isoc(&urb->ep->desc)) { for (i = 0; i < urb->number_of_packets; i++) { if (put_user(urb->iso_frame_desc[i].actual_length, &userurb->iso_frame_desc[i].actual_length)) goto err_out; if (put_user(urb->iso_frame_desc[i].status, &userurb->iso_frame_desc[i].status)) goto err_out; } } if (put_user(addr, (void __user * __user *)arg)) return -EFAULT; return 0; err_out: return -EFAULT; } static struct async *reap_as(struct usb_dev_state *ps) { DECLARE_WAITQUEUE(wait, current); struct async *as = NULL; struct usb_device *dev = ps->dev; add_wait_queue(&ps->wait, &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); as = async_getcompleted(ps); if (as || !connected(ps)) break; if (signal_pending(current)) break; usb_unlock_device(dev); schedule(); usb_lock_device(dev); } remove_wait_queue(&ps->wait, &wait); set_current_state(TASK_RUNNING); return as; } static int proc_reapurb(struct usb_dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); if (as) { int retval; snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl(as, (void __user * __user *)arg); free_async(as); return retval; } if (signal_pending(current)) return -EINTR; return -ENODEV; } static int proc_reapurbnonblock(struct usb_dev_state *ps, void __user *arg) { int retval; struct async *as; as = async_getcompleted(ps); if (as) { snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl(as, (void __user * __user *)arg); free_async(as); } else { retval = 
(connected(ps) ? -EAGAIN : -ENODEV); } return retval; } #ifdef CONFIG_COMPAT static int proc_control_compat(struct usb_dev_state *ps, struct usbdevfs_ctrltransfer32 __user *p32) { struct usbdevfs_ctrltransfer ctrl; u32 udata; if (copy_from_user(&ctrl, p32, sizeof(*p32) - sizeof(compat_caddr_t)) || get_user(udata, &p32->data)) return -EFAULT; ctrl.data = compat_ptr(udata); return do_proc_control(ps, &ctrl); } static int proc_bulk_compat(struct usb_dev_state *ps, struct usbdevfs_bulktransfer32 __user *p32) { struct usbdevfs_bulktransfer bulk; compat_caddr_t addr; if (get_user(bulk.ep, &p32->ep) || get_user(bulk.len, &p32->len) || get_user(bulk.timeout, &p32->timeout) || get_user(addr, &p32->data)) return -EFAULT; bulk.data = compat_ptr(addr); return do_proc_bulk(ps, &bulk); } static int proc_disconnectsignal_compat(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnectsignal32 ds; if (copy_from_user(&ds, arg, sizeof(ds))) return -EFAULT; ps->discsignr = ds.signr; ps->disccontext.sival_int = ds.context; return 0; } static int get_urb32(struct usbdevfs_urb *kurb, struct usbdevfs_urb32 __user *uurb) { struct usbdevfs_urb32 urb32; if (copy_from_user(&urb32, uurb, sizeof(*uurb))) return -EFAULT; kurb->type = urb32.type; kurb->endpoint = urb32.endpoint; kurb->status = urb32.status; kurb->flags = urb32.flags; kurb->buffer = compat_ptr(urb32.buffer); kurb->buffer_length = urb32.buffer_length; kurb->actual_length = urb32.actual_length; kurb->start_frame = urb32.start_frame; kurb->number_of_packets = urb32.number_of_packets; kurb->error_count = urb32.error_count; kurb->signr = urb32.signr; kurb->usercontext = compat_ptr(urb32.usercontext); return 0; } static int proc_submiturb_compat(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_urb uurb; sigval_t userurb_sigval; if (get_urb32(&uurb, (struct usbdevfs_urb32 __user *)arg)) return -EFAULT; memset(&userurb_sigval, 0, sizeof(userurb_sigval)); userurb_sigval.sival_int = ptr_to_compat(arg); return proc_do_submiturb(ps, &uurb, ((struct usbdevfs_urb32 __user *)arg)->iso_frame_desc, arg, userurb_sigval); } static int processcompl_compat(struct async *as, void __user * __user *arg) { struct urb *urb = as->urb; struct usbdevfs_urb32 __user *userurb = as->userurb; void __user *addr = as->userurb; unsigned int i; compute_isochronous_actual_length(urb); if (as->userbuffer && urb->actual_length) { if (copy_urb_data_to_user(as->userbuffer, urb)) return -EFAULT; } if (put_user(as->status, &userurb->status)) return -EFAULT; if (put_user(urb->actual_length, &userurb->actual_length)) return -EFAULT; if (put_user(urb->error_count, &userurb->error_count)) return -EFAULT; if (usb_endpoint_xfer_isoc(&urb->ep->desc)) { for (i = 0; i < urb->number_of_packets; i++) { if (put_user(urb->iso_frame_desc[i].actual_length, &userurb->iso_frame_desc[i].actual_length)) return -EFAULT; if (put_user(urb->iso_frame_desc[i].status, &userurb->iso_frame_desc[i].status)) return -EFAULT; } } if (put_user(ptr_to_compat(addr), (u32 __user *)arg)) return -EFAULT; return 0; } static int proc_reapurb_compat(struct usb_dev_state *ps, void __user *arg) { struct async *as = reap_as(ps); if (as) { int retval; snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl_compat(as, (void __user * __user *)arg); free_async(as); return retval; } if (signal_pending(current)) return -EINTR; return -ENODEV; } static int proc_reapurbnonblock_compat(struct usb_dev_state *ps, void __user *arg) { int retval; struct async *as; as = async_getcompleted(ps); if (as) { 
snoop(&ps->dev->dev, "reap %px\n", as->userurb); retval = processcompl_compat(as, (void __user * __user *)arg); free_async(as); } else { retval = (connected(ps) ? -EAGAIN : -ENODEV); } return retval; } #endif static int proc_disconnectsignal(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnectsignal ds; if (copy_from_user(&ds, arg, sizeof(ds))) return -EFAULT; ps->discsignr = ds.signr; ps->disccontext.sival_ptr = ds.context; return 0; } static int proc_claiminterface(struct usb_dev_state *ps, void __user *arg) { unsigned int ifnum; if (get_user(ifnum, (unsigned int __user *)arg)) return -EFAULT; return claimintf(ps, ifnum); } static int proc_releaseinterface(struct usb_dev_state *ps, void __user *arg) { unsigned int ifnum; int ret; if (get_user(ifnum, (unsigned int __user *)arg)) return -EFAULT; ret = releaseintf(ps, ifnum); if (ret < 0) return ret; destroy_async_on_interface(ps, ifnum); return 0; } static int proc_ioctl(struct usb_dev_state *ps, struct usbdevfs_ioctl *ctl) { int size; void *buf = NULL; int retval = 0; struct usb_interface *intf = NULL; struct usb_driver *driver = NULL; if (ps->privileges_dropped) return -EACCES; if (!connected(ps)) return -ENODEV; /* alloc buffer */ size = _IOC_SIZE(ctl->ioctl_code); if (size > 0) { buf = kmalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; if ((_IOC_DIR(ctl->ioctl_code) & _IOC_WRITE)) { if (copy_from_user(buf, ctl->data, size)) { kfree(buf); return -EFAULT; } } else { memset(buf, 0, size); } } if (ps->dev->state != USB_STATE_CONFIGURED) retval = -EHOSTUNREACH; else if (!(intf = usb_ifnum_to_if(ps->dev, ctl->ifno))) retval = -EINVAL; else switch (ctl->ioctl_code) { /* disconnect kernel driver from interface */ case USBDEVFS_DISCONNECT: if (intf->dev.driver) { driver = to_usb_driver(intf->dev.driver); dev_dbg(&intf->dev, "disconnect by usbfs\n"); usb_driver_release_interface(driver, intf); } else retval = -ENODATA; break; /* let kernel drivers try to (re)bind to the interface */ case USBDEVFS_CONNECT: if (!intf->dev.driver) retval = device_attach(&intf->dev); else retval = -EBUSY; break; /* talk directly to the interface's driver */ default: if (intf->dev.driver) driver = to_usb_driver(intf->dev.driver); if (driver == NULL || driver->unlocked_ioctl == NULL) { retval = -ENOTTY; } else { retval = driver->unlocked_ioctl(intf, ctl->ioctl_code, buf); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } } /* cleanup and return */ if (retval >= 0 && (_IOC_DIR(ctl->ioctl_code) & _IOC_READ) != 0 && size > 0 && copy_to_user(ctl->data, buf, size) != 0) retval = -EFAULT; kfree(buf); return retval; } static int proc_ioctl_default(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_ioctl ctrl; if (copy_from_user(&ctrl, arg, sizeof(ctrl))) return -EFAULT; return proc_ioctl(ps, &ctrl); } #ifdef CONFIG_COMPAT static int proc_ioctl_compat(struct usb_dev_state *ps, compat_uptr_t arg) { struct usbdevfs_ioctl32 ioc32; struct usbdevfs_ioctl ctrl; if (copy_from_user(&ioc32, compat_ptr(arg), sizeof(ioc32))) return -EFAULT; ctrl.ifno = ioc32.ifno; ctrl.ioctl_code = ioc32.ioctl_code; ctrl.data = compat_ptr(ioc32.data); return proc_ioctl(ps, &ctrl); } #endif static int proc_claim_port(struct usb_dev_state *ps, void __user *arg) { unsigned portnum; int rc; if (get_user(portnum, (unsigned __user *) arg)) return -EFAULT; rc = usb_hub_claim_port(ps->dev, portnum, ps); if (rc == 0) snoop(&ps->dev->dev, "port %d claimed by process %d: %s\n", portnum, task_pid_nr(current), current->comm); return rc; } static int proc_release_port(struct 
usb_dev_state *ps, void __user *arg) { unsigned portnum; if (get_user(portnum, (unsigned __user *) arg)) return -EFAULT; return usb_hub_release_port(ps->dev, portnum, ps); } static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg) { __u32 caps; caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM | USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP | USBDEVFS_CAP_DROP_PRIVILEGES | USBDEVFS_CAP_CONNINFO_EX | MAYBE_CAP_SUSPEND; if (!ps->dev->bus->no_stop_on_short) caps |= USBDEVFS_CAP_BULK_CONTINUATION; if (ps->dev->bus->sg_tablesize) caps |= USBDEVFS_CAP_BULK_SCATTER_GATHER; if (put_user(caps, (__u32 __user *)arg)) return -EFAULT; return 0; } static int proc_disconnect_claim(struct usb_dev_state *ps, void __user *arg) { struct usbdevfs_disconnect_claim dc; struct usb_interface *intf; if (copy_from_user(&dc, arg, sizeof(dc))) return -EFAULT; intf = usb_ifnum_to_if(ps->dev, dc.interface); if (!intf) return -EINVAL; if (intf->dev.driver) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); if (ps->privileges_dropped) return -EACCES; if ((dc.flags & USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER) && strncmp(dc.driver, intf->dev.driver->name, sizeof(dc.driver)) != 0) return -EBUSY; if ((dc.flags & USBDEVFS_DISCONNECT_CLAIM_EXCEPT_DRIVER) && strncmp(dc.driver, intf->dev.driver->name, sizeof(dc.driver)) == 0) return -EBUSY; dev_dbg(&intf->dev, "disconnect by usbfs\n"); usb_driver_release_interface(driver, intf); } return claimintf(ps, dc.interface); } static int proc_alloc_streams(struct usb_dev_state *ps, void __user *arg) { unsigned num_streams, num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf; int r; r = parse_usbdevfs_streams(ps, arg, &num_streams, &num_eps, &eps, &intf); if (r) return r; destroy_async_on_interface(ps, intf->altsetting[0].desc.bInterfaceNumber); r = usb_alloc_streams(intf, eps, num_eps, num_streams, GFP_KERNEL); kfree(eps); return r; } static int proc_free_streams(struct usb_dev_state *ps, void __user *arg) { unsigned num_eps; struct usb_host_endpoint **eps; struct usb_interface *intf; int r; r = parse_usbdevfs_streams(ps, arg, NULL, &num_eps, &eps, &intf); if (r) return r; destroy_async_on_interface(ps, intf->altsetting[0].desc.bInterfaceNumber); r = usb_free_streams(intf, eps, num_eps, GFP_KERNEL); kfree(eps); return r; } static int proc_drop_privileges(struct usb_dev_state *ps, void __user *arg) { u32 data; if (copy_from_user(&data, arg, sizeof(data))) return -EFAULT; /* This is a one way operation. Once privileges are * dropped, you cannot regain them. You may however reissue * this ioctl to shrink the allowed interfaces mask. 
*/ ps->interface_allowed_mask &= data; ps->privileges_dropped = true; return 0; } static int proc_forbid_suspend(struct usb_dev_state *ps) { int ret = 0; if (ps->suspend_allowed) { ret = usb_autoresume_device(ps->dev); if (ret == 0) ps->suspend_allowed = false; else if (ret != -ENODEV) ret = -EIO; } return ret; } static int proc_allow_suspend(struct usb_dev_state *ps) { if (!connected(ps)) return -ENODEV; WRITE_ONCE(ps->not_yet_resumed, 1); if (!ps->suspend_allowed) { usb_autosuspend_device(ps->dev); ps->suspend_allowed = true; } return 0; } static int proc_wait_for_resume(struct usb_dev_state *ps) { int ret; usb_unlock_device(ps->dev); ret = wait_event_interruptible(ps->wait_for_resume, READ_ONCE(ps->not_yet_resumed) == 0); usb_lock_device(ps->dev); if (ret != 0) return -EINTR; return proc_forbid_suspend(ps); } /* * NOTE: All requests here that have interface numbers as parameters * are assuming that somehow the configuration has been prevented from * changing. But there's no mechanism to ensure that... */ static long usbdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p) { struct usb_dev_state *ps = file->private_data; struct inode *inode = file_inode(file); struct usb_device *dev = ps->dev; int ret = -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; usb_lock_device(dev); /* Reap operations are allowed even after disconnection */ switch (cmd) { case USBDEVFS_REAPURB: snoop(&dev->dev, "%s: REAPURB\n", __func__); ret = proc_reapurb(ps, p); goto done; case USBDEVFS_REAPURBNDELAY: snoop(&dev->dev, "%s: REAPURBNDELAY\n", __func__); ret = proc_reapurbnonblock(ps, p); goto done; #ifdef CONFIG_COMPAT case USBDEVFS_REAPURB32: snoop(&dev->dev, "%s: REAPURB32\n", __func__); ret = proc_reapurb_compat(ps, p); goto done; case USBDEVFS_REAPURBNDELAY32: snoop(&dev->dev, "%s: REAPURBNDELAY32\n", __func__); ret = proc_reapurbnonblock_compat(ps, p); goto done; #endif } if (!connected(ps)) { usb_unlock_device(dev); return -ENODEV; } switch (cmd) { case USBDEVFS_CONTROL: snoop(&dev->dev, "%s: CONTROL\n", __func__); ret = proc_control(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_BULK: snoop(&dev->dev, "%s: BULK\n", __func__); ret = proc_bulk(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_RESETEP: snoop(&dev->dev, "%s: RESETEP\n", __func__); ret = proc_resetep(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_RESET: snoop(&dev->dev, "%s: RESET\n", __func__); ret = proc_resetdevice(ps); break; case USBDEVFS_CLEAR_HALT: snoop(&dev->dev, "%s: CLEAR_HALT\n", __func__); ret = proc_clearhalt(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_GETDRIVER: snoop(&dev->dev, "%s: GETDRIVER\n", __func__); ret = proc_getdriver(ps, p); break; case USBDEVFS_CONNECTINFO: snoop(&dev->dev, "%s: CONNECTINFO\n", __func__); ret = proc_connectinfo(ps, p); break; case USBDEVFS_SETINTERFACE: snoop(&dev->dev, "%s: SETINTERFACE\n", __func__); ret = proc_setintf(ps, p); break; case USBDEVFS_SETCONFIGURATION: snoop(&dev->dev, "%s: SETCONFIGURATION\n", __func__); ret = proc_setconfig(ps, p); break; case USBDEVFS_SUBMITURB: snoop(&dev->dev, "%s: SUBMITURB\n", __func__); ret = proc_submiturb(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; #ifdef CONFIG_COMPAT case USBDEVFS_CONTROL32: snoop(&dev->dev, "%s: CONTROL32\n", __func__); ret = 
proc_control_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_BULK32: snoop(&dev->dev, "%s: BULK32\n", __func__); ret = proc_bulk_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_DISCSIGNAL32: snoop(&dev->dev, "%s: DISCSIGNAL32\n", __func__); ret = proc_disconnectsignal_compat(ps, p); break; case USBDEVFS_SUBMITURB32: snoop(&dev->dev, "%s: SUBMITURB32\n", __func__); ret = proc_submiturb_compat(ps, p); if (ret >= 0) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); break; case USBDEVFS_IOCTL32: snoop(&dev->dev, "%s: IOCTL32\n", __func__); ret = proc_ioctl_compat(ps, ptr_to_compat(p)); break; #endif case USBDEVFS_DISCARDURB: snoop(&dev->dev, "%s: DISCARDURB %px\n", __func__, p); ret = proc_unlinkurb(ps, p); break; case USBDEVFS_DISCSIGNAL: snoop(&dev->dev, "%s: DISCSIGNAL\n", __func__); ret = proc_disconnectsignal(ps, p); break; case USBDEVFS_CLAIMINTERFACE: snoop(&dev->dev, "%s: CLAIMINTERFACE\n", __func__); ret = proc_claiminterface(ps, p); break; case USBDEVFS_RELEASEINTERFACE: snoop(&dev->dev, "%s: RELEASEINTERFACE\n", __func__); ret = proc_releaseinterface(ps, p); break; case USBDEVFS_IOCTL: snoop(&dev->dev, "%s: IOCTL\n", __func__); ret = proc_ioctl_default(ps, p); break; case USBDEVFS_CLAIM_PORT: snoop(&dev->dev, "%s: CLAIM_PORT\n", __func__); ret = proc_claim_port(ps, p); break; case USBDEVFS_RELEASE_PORT: snoop(&dev->dev, "%s: RELEASE_PORT\n", __func__); ret = proc_release_port(ps, p); break; case USBDEVFS_GET_CAPABILITIES: ret = proc_get_capabilities(ps, p); break; case USBDEVFS_DISCONNECT_CLAIM: ret = proc_disconnect_claim(ps, p); break; case USBDEVFS_ALLOC_STREAMS: ret = proc_alloc_streams(ps, p); break; case USBDEVFS_FREE_STREAMS: ret = proc_free_streams(ps, p); break; case USBDEVFS_DROP_PRIVILEGES: ret = proc_drop_privileges(ps, p); break; case USBDEVFS_GET_SPEED: ret = ps->dev->speed; break; case USBDEVFS_FORBID_SUSPEND: ret = proc_forbid_suspend(ps); break; case USBDEVFS_ALLOW_SUSPEND: ret = proc_allow_suspend(ps); break; case USBDEVFS_WAIT_FOR_RESUME: ret = proc_wait_for_resume(ps); break; } /* Handle variable-length commands */ switch (cmd & ~IOCSIZE_MASK) { case USBDEVFS_CONNINFO_EX(0): ret = proc_conninfo_ex(ps, p, _IOC_SIZE(cmd)); break; } done: usb_unlock_device(dev); if (ret >= 0) inode_set_atime_to_ts(inode, current_time(inode)); return ret; } static long usbdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; ret = usbdev_do_ioctl(file, cmd, (void __user *)arg); return ret; } /* No kernel lock - fine */ static __poll_t usbdev_poll(struct file *file, struct poll_table_struct *wait) { struct usb_dev_state *ps = file->private_data; __poll_t mask = 0; poll_wait(file, &ps->wait, wait); if (file->f_mode & FMODE_WRITE && !list_empty(&ps->async_completed)) mask |= EPOLLOUT | EPOLLWRNORM; if (!connected(ps)) mask |= EPOLLHUP; if (list_empty(&ps->list)) mask |= EPOLLERR; return mask; } const struct file_operations usbdev_file_operations = { .owner = THIS_MODULE, .llseek = no_seek_end_llseek, .read = usbdev_read, .poll = usbdev_poll, .unlocked_ioctl = usbdev_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = usbdev_mmap, .open = usbdev_open, .release = usbdev_release, }; static void usbdev_remove(struct usb_device *udev) { struct usb_dev_state *ps; /* Protect against simultaneous resume */ mutex_lock(&usbfs_mutex); while (!list_empty(&udev->filelist)) { ps = list_entry(udev->filelist.next, struct usb_dev_state, list); 
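/* For each remaining open file on the vanishing device: cancel its
 * outstanding URBs, wake poll()/reap and resume waiters, unlink it from
 * the device's filelist, and deliver the user-requested disconnect signal. */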
destroy_all_async(ps); wake_up_all(&ps->wait); WRITE_ONCE(ps->not_yet_resumed, 0); wake_up_all(&ps->wait_for_resume); list_del_init(&ps->list); if (ps->discsignr) kill_pid_usb_asyncio(ps->discsignr, EPIPE, ps->disccontext, ps->disc_pid, ps->cred); } mutex_unlock(&usbfs_mutex); } static int usbdev_notify(struct notifier_block *self, unsigned long action, void *dev) { switch (action) { case USB_DEVICE_ADD: break; case USB_DEVICE_REMOVE: usbdev_remove(dev); break; } return NOTIFY_OK; } static struct notifier_block usbdev_nb = { .notifier_call = usbdev_notify, }; static struct cdev usb_device_cdev; int __init usb_devio_init(void) { int retval; retval = register_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX, "usb_device"); if (retval) { printk(KERN_ERR "Unable to register minors for usb_device\n"); goto out; } cdev_init(&usb_device_cdev, &usbdev_file_operations); retval = cdev_add(&usb_device_cdev, USB_DEVICE_DEV, USB_DEVICE_MAX); if (retval) { printk(KERN_ERR "Unable to get usb_device major %d\n", USB_DEVICE_MAJOR); goto error_cdev; } usb_register_notify(&usbdev_nb); out: return retval; error_cdev: unregister_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX); goto out; } void usb_devio_cleanup(void) { usb_unregister_notify(&usbdev_nb); cdev_del(&usb_device_cdev); unregister_chrdev_region(USB_DEVICE_DEV, USB_DEVICE_MAX); }
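do_proc_control() above implements the synchronous USBDEVFS_CONTROL path. A hedged userspace sketch of driving it with a standard GET_DESCRIPTOR request; the device-node path is again a placeholder and the sketch is not part of this file:

/* Hypothetical userspace sketch: read the 18-byte device descriptor
 * through USBDEVFS_CONTROL, which lands in do_proc_control() above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usbdevice_fs.h>
#include <linux/usb/ch9.h>

int main(void)
{
	unsigned char desc[18];
	struct usbdevfs_ctrltransfer ctrl;
	int fd = open("/dev/bus/usb/001/002", O_RDWR);	/* placeholder path */

	if (fd < 0)
		return 1;
	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.bRequestType = USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE;
	ctrl.bRequest = USB_REQ_GET_DESCRIPTOR;
	ctrl.wValue = USB_DT_DEVICE << 8;
	ctrl.wIndex = 0;
	ctrl.wLength = sizeof(desc);
	ctrl.timeout = 1000;		/* milliseconds */
	ctrl.data = desc;
	/* On success the ioctl returns the actual transfer length */
	if (ioctl(fd, USBDEVFS_CONTROL, &ctrl) == (int)sizeof(desc))
		printf("idVendor=%02x%02x idProduct=%02x%02x\n",
		       desc[9], desc[8], desc[11], desc[10]);
	close(fd);
	return 0;
}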
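proc_do_submiturb() and proc_reapurb() above form the asynchronous URB path. A sketch of one bulk IN transfer through that path; the endpoint address (0x81), interface number, and device node are placeholders chosen for illustration:

/* Hypothetical userspace sketch: submit one bulk IN URB and reap it. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usbdevice_fs.h>

int main(void)
{
	unsigned char buf[64];
	struct usbdevfs_urb urb, *reaped = NULL;
	unsigned int ifnum = 0;		/* placeholder interface */
	int fd = open("/dev/bus/usb/001/002", O_RDWR);	/* placeholder path */

	if (fd < 0)
		return 1;
	/* claimintf() must succeed before URBs to this endpoint are accepted */
	if (ioctl(fd, USBDEVFS_CLAIMINTERFACE, &ifnum) < 0)
		return 1;
	memset(&urb, 0, sizeof(urb));
	urb.type = USBDEVFS_URB_TYPE_BULK;
	urb.endpoint = 0x81;		/* placeholder bulk IN endpoint */
	urb.buffer = buf;
	urb.buffer_length = sizeof(buf);
	if (ioctl(fd, USBDEVFS_SUBMITURB, &urb) < 0)	/* proc_submiturb() */
		return 1;
	/* Blocks in reap_as() until the completion is queued */
	if (ioctl(fd, USBDEVFS_REAPURB, &reaped) == 0 && reaped == &urb)
		printf("status %d, %d bytes\n", urb.status, urb.actual_length);
	ioctl(fd, USBDEVFS_RELEASEINTERFACE, &ifnum);
	close(fd);
	return 0;
}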
// SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org> * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. */ #define pr_fmt(fmt) "debugfs: " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/security.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret) return ret; } return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_dir_inode_operations = { .lookup = simple_lookup, .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = debugfs_setattr, }; static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct debugfs_fs_info { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield.
*/ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_u32 ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_u32 ("uid", Opt_uid), {} }; static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct debugfs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; kuid_t uid; kgid_t gid; int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); if (opt < 0) { /* * We might like to report bad mount options here; but * traditionally debugfs has ignored all mount options */ if (opt == -ENOPARAM) return 0; return opt; } switch (opt) { case Opt_uid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) return invalf(fc, "Unknown uid"); opts->uid = uid; break; case Opt_gid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) return invalf(fc, "Unknown gid"); opts->gid = gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(opt); return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || fsi->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= fsi->mode; } if (!remount || fsi->opts & BIT(Opt_uid)) inode->i_uid = fsi->uid; if (!remount || fsi->opts & BIT(Opt_gid)) inode->i_gid = fsi->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; debugfs_apply_options_remount(sb); return 0; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, fsi->uid)); if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, fsi->gid)); if (fsi->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mode); return 0; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .show_options = debugfs_show_options, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ if (fsd && fsd->real_fops) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct debugfs_fsdata *fsd = path->dentry->d_fsdata; return fsd->automount(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { .d_delete 
= always_delete_dentry, .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr debug_files[] = {{""}}; int err; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) return err; sb->s_op = &debugfs_super_operations; sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); return 0; } static int debugfs_get_tree(struct fs_context *fc) { if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return -EPERM; return get_tree_single(fc, debugfs_fill_super); } static void debugfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations debugfs_context_ops = { .free = debugfs_free_fc, .parse_param = debugfs_parse_param, .get_tree = debugfs_get_tree, .reconfigure = debugfs_reconfigure, }; static int debugfs_init_fs_context(struct fs_context *fc) { struct debugfs_fs_info *fsi; fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mode = DEBUGFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &debugfs_context_ops; return 0; } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .init_fs_context = debugfs_init_fs_context, .parameters = debugfs_param_specs, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. 
*/ if (!parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", name, parent->d_name.name); else pr_err("File '%s' in directory '%s' already present!\n", name, parent->d_name.name); dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *proxy_fops, const struct file_operations *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent); if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. 
*/ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_full_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_open_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. 
* * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { struct dentry *dentry = start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); return failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_create_automount - create automount point in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @f: function to be called when pathname resolution steps on that one. * @data: opaque argument to pass to f(). * * @f should return what ->d_automount() would. */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); struct debugfs_fsdata *fsd; struct inode *inode; if (IS_ERR(dentry)) return dentry; fsd = kzalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) { failed_creating(dentry); return ERR_PTR(-ENOMEM); } fsd->automount = f; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); kfree(fsd); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); kfree(fsd); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = fsd; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. 
* @target: a pointer to a string containing the path to the target of the * symbolic link. * * This function creates a symbolic link with the given name in debugfs that * links to the given target path. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { struct dentry *dentry; struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); dentry = start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); static void __debugfs_file_removed(struct dentry *dentry) { struct debugfs_fsdata *fsd; /* * Paired with the closing smp_mb() implied by a successful * cmpxchg() in debugfs_file_get(): either * debugfs_file_get() must see a dead dentry or we must see a * debugfs_fsdata instance at ->d_fsdata here (or both). */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; /* if this was the last reference, we're done */ if (refcount_dec_and_test(&fsd->active_users)) return; /* * If there's still a reference, the code that obtained it can * be in different states: * - The common case of not using cancellations, or already * after debugfs_leave_cancellation(), where we just need * to wait for debugfs_file_put() which signals the completion; * - inside a cancellation section, i.e. between * debugfs_enter_cancellation() and debugfs_leave_cancellation(), * in which case we need to trigger the ->cancel() function, * and then wait for debugfs_file_put() just like in the * previous case; * - before debugfs_enter_cancellation() (but obviously after * debugfs_file_get()), in which case we may not see the * cancellation in the list on the first round of the loop, * but debugfs_enter_cancellation() signals the completion * after adding it, so this code gets woken up to call the * ->cancel() function. */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; /* * Lock the cancellations. Note that the cancellations * structs are meant to be on the stack, so we need to * ensure we either use them here or don't touch them, * and debugfs_leave_cancellation() will wait for this * to be finished processing before exiting one. It may * of course win and remove the cancellation, but then * chances are we never even got into this bit, we only * do if the refcount isn't zero already. 
*/ mutex_lock(&fsd->cancellations_mtx); while ((c = list_first_entry_or_null(&fsd->cancellations, typeof(*c), list))) { list_del_init(&c->list); c->cancel(dentry, c->cancel_data); } mutex_unlock(&fsd->cancellations_mtx); wait_for_completion(&fsd->active_users_drained); } } static void remove_one(struct dentry *victim) { if (d_is_reg(victim)) __debugfs_file_removed(victim); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } /** * debugfs_remove - recursively removes a directory * @dentry: a pointer to the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function * (like debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed; no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ void debugfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it * @name: a pointer to a string containing the name of the item to look up. * @parent: a pointer to the parent dentry of the item. * * This is the equivalent of doing something like * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting * handled for the directory being looked up. */ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) { struct dentry *dentry; dentry = debugfs_lookup(name, parent); if (!dentry) return; debugfs_remove(dentry); dput(dentry); } EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This * should be a directory dentry. * @old_dentry: dentry of an object to be renamed. * @new_dir: a pointer to the parent dentry where the object should be * moved. This should be a directory dentry. * @new_name: a pointer to a string containing the target name. * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * * This function will return a pointer to old_dentry (which is updated to * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name) { int error; struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; if (IS_ERR(old_dir)) return old_dir; if (IS_ERR(new_dir)) return new_dir; if (IS_ERR_OR_NULL(old_dentry)) return old_dentry; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) goto exit; /* Source does not exist, cyclic rename, or mountpoint? */ if (d_really_is_negative(old_dentry) || old_dentry == trap || d_mountpoint(old_dentry)) goto exit; dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); /* Lookup failed, cyclic rename or target exists?
*/ if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name, d_is_dir(old_dentry), NULL, old_dentry); release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; exit: if (dentry && !IS_ERR(dentry)) dput(dentry); unlock_rename(new_dir, old_dir); if (IS_ERR(dentry)) return dentry; return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(debugfs_rename); /** * debugfs_initialized - Tells whether debugfs has been registered */ bool debugfs_initialized(void) { return debugfs_registered; } EXPORT_SYMBOL_GPL(debugfs_initialized); static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT; else if (!strcmp(str, "no-mount")) debugfs_allow = DEBUGFS_ALLOW_API; else if (!strcmp(str, "off")) debugfs_allow = 0; } return 0; } early_param("debugfs", debugfs_kernel); static int __init debugfs_init(void) { int retval; if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT)) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); if (retval) return retval; retval = register_filesystem(&debug_fs_type); if (retval) sysfs_remove_mount_point(kernel_kobj, "debug"); else debugfs_registered = true; return retval; } core_initcall(debugfs_init);
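For orientation, the sketch below shows how a driver typically consumes the API above from its init/exit path. It is an illustrative example, not part of debugfs itself: the module name "example", the directory name and the counter variable are hypothetical, and it uses the debugfs_create_u32() helper (built on this core elsewhere in debugfs) so the error returns can simply be ignored, as the documentation above recommends.

#include <linux/debugfs.h>
#include <linux/module.h>

static struct dentry *example_dir;	/* hypothetical: root of this module's debugfs tree */
static u32 example_counter;

static int __init example_init(void)
{
	/* Creates <debugfs>/example; a NULL parent means the debugfs root. */
	example_dir = debugfs_create_dir("example", NULL);

	/* Even if example_dir is an ERR_PTR, the helper below copes with it. */
	debugfs_create_u32("counter", 0644, example_dir, &example_counter);
	return 0;
}

static void __exit example_exit(void)
{
	/* One recursive removal cleans up the directory and every file in it. */
	debugfs_remove(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");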
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> #include <linux/io_uring.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events.
*/ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @ret: return code * * Allows to trace fixed files/buffers, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files) __field( unsigned, nr_bufs ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @req: pointer to a submitted request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(struct io_kiocb *req, int fd), TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), TP_printk("ring %p, req %p, user_data 0x%llx, fd %d", __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @req: pointer to a submitted request * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. 
*/ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(struct io_kiocb *req, int rw), TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; __entry->rw = rw; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @req: pointer to a deferred request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) __field( u8, opcode ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->data = req->cqe.user_data; __entry->opcode = req->opcode; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", __entry->ctx, __entry->req, __entry->data, __get_str(op_str)) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. 
*/ TRACE_EVENT(io_uring_fail_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( void *, link ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->link = link; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @req: pointer to a submitted request * @user_data: user data associated with the request * @res: result of the request * @cflags: completion flags * @extra1: extra 64-bit data for CQE32 * @extra2: extra 64-bit data for CQE32 * */ TRACE_EVENT(io_uring_complete, TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags, u64 extra1, u64 extra2), TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) __field( u64, extra1 ) __field( u64, extra2 ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->extra1 = extra1; __entry->extra2 = extra2; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, __entry->res, __entry->cflags, (unsigned long long) __entry->extra1, (unsigned long long) __entry->extra2) ); /** * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_req, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->sq_thread) ); /* * io_uring_poll_arm - called after arming a poll wait if successful * * @req: pointer to the armed request * @mask: request poll events mask * @events: registered events of interest * * Allows to track which fds are waiting for and what are the events of * interest. 
*/ TRACE_EVENT(io_uring_poll_arm, TP_PROTO(struct io_kiocb *req, int mask, int events), TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __field( int, events ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask, __entry->events) ); /* * io_uring_task_add - called after adding a task * * @req: pointer to request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, TP_PROTO(struct io_kiocb *req, int mask), TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask) ); /* * io_uring_req_failed - called when an sqe is errored during submission * * @sqe: pointer to the io_uring_sqe that failed * @req: pointer to request * @error: error it failed with * * Allows easier diagnosing of malformed requests in production systems. */ TRACE_EVENT(io_uring_req_failed, TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u8, flags ) __field( u8, ioprio ) __field( u64, off ) __field( u64, addr ) __field( u32, len ) __field( u32, op_flags ) __field( u16, buf_index ) __field( u16, personality ) __field( u32, file_index ) __field( u64, pad1 ) __field( u64, addr3 ) __field( int, error ) __string( op_str, io_uring_get_opcode(sqe->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; __entry->flags = sqe->flags; __entry->ioprio = sqe->ioprio; __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; __entry->pad1 = sqe->__pad2[0]; __entry->addr3 = sqe->addr3; __entry->error = error; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, " "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, (unsigned long long) __entry->addr3, __entry->error) ); /* * io_uring_cqe_overflow - a CQE overflowed * * @ctx: pointer to a ring context structure * @user_data: user
data associated with the request * @res: CQE result * @cflags: CQE flags * @ocqe: pointer to the overflow cqe (if available) * */ TRACE_EVENT(io_uring_cqe_overflow, TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, void *ocqe), TP_ARGS(ctx, user_data, res, cflags, ocqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned long long, user_data ) __field( s32, res ) __field( u32, cflags ) __field( void *, ocqe ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->ocqe = ocqe; ), TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) ); /* * io_uring_task_work_run - ran task work * * @tctx: pointer to a io_uring_task * @count: how many functions it ran * */ TRACE_EVENT(io_uring_task_work_run, TP_PROTO(void *tctx, unsigned int count), TP_ARGS(tctx, count), TP_STRUCT__entry ( __field( void *, tctx ) __field( unsigned int, count ) ), TP_fast_assign( __entry->tctx = tctx; __entry->count = count; ), TP_printk("tctx %p, count %u", __entry->tctx, __entry->count) ); TRACE_EVENT(io_uring_short_write, TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), TP_ARGS(ctx, fpos, wanted, got), TP_STRUCT__entry( __field(void *, ctx) __field(u64, fpos) __field(u64, wanted) __field(u64, got) ), TP_fast_assign( __entry->ctx = ctx; __entry->fpos = fpos; __entry->wanted = wanted; __entry->got = got; ), TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", __entry->ctx, __entry->fpos, __entry->wanted, __entry->got) ); /* * io_uring_local_work_run - ran ring local task work * * @tctx: pointer to a io_uring_ctx * @count: how many functions it ran * @loops: how many loops it ran * */ TRACE_EVENT(io_uring_local_work_run, TP_PROTO(void *ctx, int count, unsigned int loops), TP_ARGS(ctx, count, loops), TP_STRUCT__entry ( __field(void *, ctx ) __field(int, count ) __field(unsigned int, loops ) ), TP_fast_assign( __entry->ctx = ctx; __entry->count = count; __entry->loops = loops; ), TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
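Each TRACE_EVENT() above expands into a trace_<name>() inline that the io_uring core calls at the matching point, and one compilation unit defines CREATE_TRACE_POINTS before including the header so the event bodies are emitted there. The sketch below only illustrates that calling pattern under those assumptions; example_report() and the places it would be called from are hypothetical, not actual io_uring call sites.

/* Done once, in a single .c file, so the tracepoint bodies are emitted there. */
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

/* Hypothetical helper: fire two of the events defined above. */
static inline void example_report(int fd, struct io_ring_ctx *ctx, struct io_kiocb *req)
{
	/* Matches io_uring_create's TP_PROTO(fd, ctx, sq_entries, cq_entries, flags). */
	trace_io_uring_create(fd, ctx, ctx->sq_entries, ctx->cq_entries, ctx->flags);

	/* Matches io_uring_defer's TP_PROTO(req). */
	trace_io_uring_defer(req);
}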
// SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * * (C) 2002-2003 Nadia Yvette Chambers, IBM * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures
are backing objects for tasks sharing a given ID to chain * against. There is very little to them aside from hashing them and * parking tasks using given ID's on a list. * * The hash is always changed with the tasklist_lock write-acquired, * and the hash is only accessed with the tasklist_lock at least * read-acquired, so there's no additional SMP locking needed here. * * We have a list of bitmap pages, which bitmaps represent the PID space. * Allocating and freeing PIDs is completely lockless. The worst-case * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). * * Pid namespaces: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/anon_inodes.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, { .first = NULL }, }, .level = 0, .numbers = { { .nr = 0, .ns = &init_pid_ns, }, } }; int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* * Pseudo filesystems start inode numbering after one. We use Reserved * PIDs as a natural offset. */ static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif }; EXPORT_SYMBOL_GPL(init_pid_ns); /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). * * If we don't disable interrupts there is a nasty deadlock between * detach_pid()->free_pid() and another cpu that does * spin_lock(&pidmap_lock) followed by an interrupt routine that does * read_lock(&tasklist_lock); * * After we clean up the tasklist_lock and know there are no * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. 
*/ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { struct pid *pid = container_of(rhp, struct pid, rcu); put_pid(pid); } void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace * is the reaper, wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ wake_up_process(ns->child_reaper); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; break; } idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); } struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; /* * set_tid_size contains the size of the set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace * up to set_tid_size PID namespaces. It does not have to set the PID * for a process in all nested PID namespaces but set_tid_size must * never be greater than the current ns->level + 1. */ if (set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { int tid = 0; if (set_tid_size) { tid = set_tid[ns->level - i]; retval = -EINVAL; if (tid < 1 || tid >= pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * already in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS; /* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); if (nr < 0) { retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } /* * ENOMEM is not the most obvious choice especially for the case * where the child subreaper has already exited and the pid * namespace denies the creation of any new processes. But ENOMEM * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited.
*/ retval = -ENOMEM; get_pid_ns(ns); refcount_set(&pid->count, 1); spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pid->stashed = NULL; pid->ino = ++pidfs_ino; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); put_pid_ns(ns); out_free: spin_lock_irq(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : &task->signal->pids[type]; } /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { struct pid *pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; if (type == PIDTYPE_PID) { WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); wake_up_all(&pid->wait_pidfd); } for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; free_pid(pid); } void detach_pid(struct task_struct *task, enum pid_type type) { __change_pid(task, type, NULL); } void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); attach_pid(task, type); } void exchange_tids(struct task_struct *left, struct task_struct *right) { struct pid *pid1 = left->thread_pid; struct pid *pid2 = right->thread_pid; struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); /* Swap the per task_struct pid */ rcu_assign_pointer(left->thread_pid, pid2); rcu_assign_pointer(right->thread_pid, pid1); /* Swap the cached value */ WRITE_ONCE(left->pid, pid_nr(pid2)); WRITE_ONCE(right->pid, pid_nr(pid1)); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 
lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct task_struct *find_get_task_by_vpid(pid_t nr) { struct task_struct *task; rcu_read_lock(); task = find_task_by_vpid(nr); if (task) get_task_struct(task); rcu_read_unlock(); return task; } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); result = pid_task(pid, type); if (result) get_task_struct(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { struct pid *pid; rcu_read_lock(); pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; pid_t nr = 0; if (pid && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; } return nr; } EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns) { pid_t nr = 0; rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; } EXPORT_SYMBOL(__task_pid_nr_ns); struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); } EXPORT_SYMBOL_GPL(task_active_pid_ns); /* * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); if (!IS_ERR(pid)) { get_pid(pid); *flags = f.file->f_flags; } fdput(f); return pid; } /** * pidfd_get_task() - Get the task associated with a pidfd * * @pidfd: pidfd for which to get the task * @flags: flags associated with this pidfd * * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. 
*/ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { unsigned int f_flags; struct pid *pid; struct task_struct *task; pid = pidfd_get_pid(pidfd, &f_flags); if (IS_ERR(pid)) return ERR_CAST(pid); task = get_pid_task(pid, PIDTYPE_TGID); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); *flags = f_flags; return task; } /** * pidfd_create() - Create a new pid file descriptor. * * @pid: struct pid that the pidfd will reference * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * * This symbol should not be explicitly exported to loadable modules. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; pidfd = pidfd_prepare(pid, flags, &pidfd_file); if (pidfd < 0) return pidfd; fd_install(pidfd, pidfd_file); return pidfd; } /** * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for * the task identified by @pid. Without PIDFD_THREAD flag the target task * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd; struct pid *p; if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; fd = pidfd_create(p, flags); put_pid(p); return fd; } void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = kmem_cache_create("pid", struct_size_t(struct pid, numbers, 1), __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = fget_task(task, fd); else file = ERR_PTR(-EPERM); up_read(&task->signal->exec_update_lock); if (!file) { /* * It is possible that the target thread is exiting; it can be * either: * 1. before exit_signals(), which gives a real fd * 2. before exit_files() takes the task_lock() gives a real fd * 3. after exit_files() releases task_lock(), ->files is NULL; * this has PF_EXITING, since it was set in exit_signals(), * __pidfd_fget() returns EBADF. * In case 3 we get EBADF, but that really means ESRCH, since * the task is currently exiting and has freed its files * struct, so we fix it up. 
*/ if (task->flags & PF_EXITING) file = ERR_PTR(-ESRCH); else file = ERR_PTR(-EBADF); } return file; } static int pidfd_getfd(struct pid *pid, int fd) { struct task_struct *task; struct file *file; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; file = __pidfd_fget(task, fd); put_task_struct(task); if (IS_ERR(file)) return PTR_ERR(file); ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; } /** * sys_pidfd_getfd() - Get a file descriptor from another process * * @pidfd: the pidfd file descriptor of the process * @fd: the file descriptor number to get * @flags: flags on how to get the fd (reserved) * * This syscall gets a copy of a file descriptor from another process * based on the pidfd, and file descriptor number. It requires that * the calling process has the ability to ptrace the process represented * by the pidfd. The process which is having its file descriptor copied * is otherwise unaffected. * * Return: On success, a cloexec file descriptor is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; struct fd f; int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; pid = pidfd_pid(f.file); if (IS_ERR(pid)) ret = PTR_ERR(pid); else ret = pidfd_getfd(pid, fd); fdput(f); return ret; }
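Seen from userspace, sys_pidfd_open() and sys_pidfd_getfd() above compose naturally: open a pidfd for a target process, then copy one of its descriptors into the caller (which requires ptrace-level access to the target). The sketch below is a hedged illustration using raw syscall(2) with SYS_pidfd_open/SYS_pidfd_getfd, since libc wrappers may not be available everywhere; the PID and fd number are taken from the command line.

#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <fd>\n", argv[0]);
		return 1;
	}

	/* pidfd_open(pid, 0) fails with ESRCH if no such process exists. */
	int pidfd = syscall(SYS_pidfd_open, (pid_t)atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* pidfd_getfd(pidfd, targetfd, 0) needs PTRACE_MODE_ATTACH access. */
	int newfd = syscall(SYS_pidfd_getfd, pidfd, atoi(argv[2]), 0);
	if (newfd < 0) {
		perror("pidfd_getfd");
		return 1;
	}

	printf("fd %s of pid %s duplicated locally as fd %d\n", argv[2], argv[1], newfd);
	return 0;
}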
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/uidgid_types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; struct uid_gid_map; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) {
return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } static inline u32 map_id_down(struct uid_gid_map *map, u32 id) { return id; } static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
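The usual pattern built on these helpers is to convert a raw uid supplied by userspace into a kuid_t relative to the caller's user namespace, reject it if the namespace has no mapping, and only then store or compare it. A minimal sketch, assuming a hypothetical example_set_owner() helper:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/uidgid.h>

/* Hypothetical: validate a uid from userspace and record it as a kuid_t. */
static int example_set_owner(uid_t user_uid, kuid_t *owner)
{
	/* Interpret the raw value in the caller's user namespace. */
	kuid_t kuid = make_kuid(current_user_ns(), user_uid);

	/* An unmapped value comes back as INVALID_UID, i.e. (uid_t)-1. */
	if (!uid_valid(kuid))
		return -EINVAL;

	*owner = kuid;
	return 0;
}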
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * * Authors: David S. Miller (davem@caip.rutgers.edu) * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/socket.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/tcp.h> #include <net/udp.h> #include <net/transp_v6.h> #include <net/ipv6.h> #define MAX4(a, b, c, d) \ max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", sock_prot_inuse_get(net, &udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv6.fqdir)); return 0; } static const struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), SNMP_MIB_ITEM("Ip6OutRequests",
IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { /* icmpv6 mib according to RFC 2466 */ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; /* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ static const char *const icmp6type2name[256] = { [ICMPV6_DEST_UNREACH] = "DestUnreachs", [ICMPV6_PKT_TOOBIG] = "PktTooBigs", [ICMPV6_TIME_EXCEED] = "TimeExcds", [ICMPV6_PARAMPROB] = "ParmProblems", [ICMPV6_ECHO_REQUEST] = "Echos", [ICMPV6_ECHO_REPLY] = "EchoReplies", [ICMPV6_MGM_QUERY] = "GroupMembQueries", [ICMPV6_MGM_REPORT] = "GroupMembResponses", [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", [ICMPV6_MLD2_REPORT] = "MLDv2Reports", [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", [NDISC_REDIRECT] = "Redirects", }; static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), 
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; int i; /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { int icmptype; const char *p; icmptype = i & 0xff; p = icmp6type2name[icmptype]; if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; val = atomic_long_read(smib + i); if (!val) continue; snprintf(name, sizeof(name), "Icmp6%sType%u", i & 0x100 ? "Out" : "In", i & 0xff); seq_printf(seq, "%-32s\t%lu\n", name, val); } } /* can be called either with percpu mib (pcpumib != NULL), * or shared one (smib != NULL) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); snmp_get_cpu_field_batch(buff, itemlist, pcpumib); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, NULL, snmp6_udp6_list); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, NULL, snmp6_udplite6_list); return 0; } static int snmp6_dev_seq_show(struct seq_file *seq, void *v) { struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } int snmp6_register_dev(struct inet6_dev *idev) { struct proc_dir_entry *p; struct net *net; if (!idev || !idev->dev) return -EINVAL; net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; p = proc_create_single_data(idev->dev->name, 0444, net->mib.proc_net_devsnmp6, snmp6_dev_seq_show, idev); if (!p) return -ENOMEM; idev->stats.proc_dir_entry = p; return 0; } int snmp6_unregister_dev(struct inet6_dev *idev) { struct net *net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; if (!idev->stats.proc_dir_entry) return -EINVAL; proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } static int __net_init 
ipv6_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat6", 0444, net->proc_net, sockstat6_seq_show, NULL)) return -ENOMEM; if (!proc_create_net_single("snmp6", 0444, net->proc_net, snmp6_seq_show, NULL)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); if (!net->mib.proc_net_devsnmp6) goto proc_dev_snmp6_fail; return 0; proc_dev_snmp6_fail: remove_proc_entry("snmp6", net->proc_net); proc_snmp6_fail: remove_proc_entry("sockstat6", net->proc_net); return -ENOMEM; } static void __net_exit ipv6_proc_exit_net(struct net *net) { remove_proc_entry("sockstat6", net->proc_net); remove_proc_entry("dev_snmp6", net->proc_net); remove_proc_entry("snmp6", net->proc_net); } static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, }; int __init ipv6_misc_proc_init(void) { return register_pernet_subsys(&ipv6_proc_ops); } void ipv6_misc_proc_exit(void) { unregister_pernet_subsys(&ipv6_proc_ops); }
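The seq_show handlers above emit one counter per line via seq_printf(seq, "%-32s\t%lu\n", name, value), so /proc/net/snmp6 is a plain name/value table. As a quick illustration (not part of the kernel source), the following userspace sketch parses that format; the Udp6 filter is just an arbitrary example.

/* Minimal userspace sketch: parse the "name<TAB>value" lines that
 * snmp6_seq_show() produces. Error handling is kept to a minimum.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long value;
	FILE *fp = fopen("/proc/net/snmp6", "r");

	if (!fp) {
		perror("fopen");
		return 1;
	}
	while (fscanf(fp, "%63s %lu", name, &value) == 2) {
		/* Print only the UDPv6 counters as an example filter. */
		if (strncmp(name, "Udp6", 4) == 0)
			printf("%s = %lu\n", name, value);
	}
	fclose(fp);
	return 0;
}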
/* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * Copyright 2016 Intel Corp. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE.
*/ #ifndef _DRM_DRV_H_ #define _DRM_DRV_H_ #include <linux/list.h> #include <linux/irqreturn.h> #include <video/nomodeset.h> #include <drm/drm_device.h> struct drm_file; struct drm_gem_object; struct drm_master; struct drm_minor; struct dma_buf; struct dma_buf_attachment; struct drm_display_mode; struct drm_mode_create_dumb; struct drm_printer; struct sg_table; /** * enum drm_driver_feature - feature flags * * See &drm_driver.driver_features, drm_device.driver_features and * drm_core_check_feature(). */ enum drm_driver_feature { /** * @DRIVER_GEM: * * Driver use the GEM memory manager. This should be set for all modern * drivers. */ DRIVER_GEM = BIT(0), /** * @DRIVER_MODESET: * * Driver supports mode setting interfaces (KMS). */ DRIVER_MODESET = BIT(1), /** * @DRIVER_RENDER: * * Driver supports dedicated render nodes. See also the :ref:`section on * render nodes <drm_render_node>` for details. */ DRIVER_RENDER = BIT(3), /** * @DRIVER_ATOMIC: * * Driver supports the full atomic modesetting userspace API. Drivers * which only use atomic internally, but do not support the full * userspace API (e.g. not all properties converted to atomic, or * multi-plane updates are not guaranteed to be tear-free) should not * set this flag. */ DRIVER_ATOMIC = BIT(4), /** * @DRIVER_SYNCOBJ: * * Driver supports &drm_syncobj for explicit synchronization of command * submission. */ DRIVER_SYNCOBJ = BIT(5), /** * @DRIVER_SYNCOBJ_TIMELINE: * * Driver supports the timeline flavor of &drm_syncobj for explicit * synchronization of command submission. */ DRIVER_SYNCOBJ_TIMELINE = BIT(6), /** * @DRIVER_COMPUTE_ACCEL: * * Driver supports compute acceleration devices. This flag is mutually exclusive with * @DRIVER_RENDER and @DRIVER_MODESET. Devices that support both graphics and compute * acceleration should be handled by two drivers that are connected using auxiliary bus. */ DRIVER_COMPUTE_ACCEL = BIT(7), /** * @DRIVER_GEM_GPUVA: * * Driver supports user defined GPU VA bindings for GEM objects. */ DRIVER_GEM_GPUVA = BIT(8), /** * @DRIVER_CURSOR_HOTSPOT: * * Driver supports and requires cursor hotspot information in the * cursor plane (e.g. cursor plane has to actually track the mouse * cursor and the clients are required to set hotspot in order for * the cursor planes to work correctly). */ DRIVER_CURSOR_HOTSPOT = BIT(9), /* IMPORTANT: Below are all the legacy flags, add new ones above. */ /** * @DRIVER_USE_AGP: * * Set up DRM AGP support, see drm_agp_init(), the DRM core will manage * AGP resources. New drivers don't need this. */ DRIVER_USE_AGP = BIT(25), /** * @DRIVER_LEGACY: * * Denote a legacy driver using shadow attach. Do not use. */ DRIVER_LEGACY = BIT(26), /** * @DRIVER_PCI_DMA: * * Driver is capable of PCI DMA, mapping of PCI DMA buffers to userspace * will be enabled. Only for legacy drivers. Do not use. */ DRIVER_PCI_DMA = BIT(27), /** * @DRIVER_SG: * * Driver can perform scatter/gather DMA, allocation and mapping of * scatter/gather buffers will be enabled. Only for legacy drivers. Do * not use. */ DRIVER_SG = BIT(28), /** * @DRIVER_HAVE_DMA: * * Driver supports DMA, the userspace DMA API will be supported. Only * for legacy drivers. Do not use. */ DRIVER_HAVE_DMA = BIT(29), /** * @DRIVER_HAVE_IRQ: * * Legacy irq support. Only for legacy drivers. Do not use. */ DRIVER_HAVE_IRQ = BIT(30), }; /** * struct drm_driver - DRM driver structure * * This structure represent the common code for a family of cards. There will be * one &struct drm_device for each card present in this family. 
It contains lots * of vfunc entries, and a pile of those probably should be moved to more * appropriate places like &drm_mode_config_funcs or into a new operations * structure for GEM drivers. */ struct drm_driver { /** * @load: * * Backward-compatible driver callback to complete initialization steps * after the driver is registered. For this reason, may suffer from * race conditions and its use is deprecated for new drivers. It is * therefore only supported for existing drivers not yet converted to * the new scheme. See devm_drm_dev_alloc() and drm_dev_register() for * proper and race-free way to set up a &struct drm_device. * * This is deprecated, do not use! * * Returns: * * Zero on success, non-zero value on failure. */ int (*load) (struct drm_device *, unsigned long flags); /** * @open: * * Driver callback when a new &struct drm_file is opened. Useful for * setting up driver-private data structures like buffer allocators, * execution contexts or similar things. Such driver-private resources * must be released again in @postclose. * * Since the display/modeset side of DRM can only be owned by exactly * one &struct drm_file (see &drm_file.is_master and &drm_device.master) * there should never be a need to set up any modeset related resources * in this callback. Doing so would be a driver design bug. * * Returns: * * 0 on success, a negative error code on failure, which will be * promoted to userspace as the result of the open() system call. */ int (*open) (struct drm_device *, struct drm_file *); /** * @postclose: * * One of the driver callbacks when a new &struct drm_file is closed. * Useful for tearing down driver-private data structures allocated in * @open like buffer allocators, execution contexts or similar things. * * Since the display/modeset side of DRM can only be owned by exactly * one &struct drm_file (see &drm_file.is_master and &drm_device.master) * there should never be a need to tear down any modeset related * resources in this callback. Doing so would be a driver design bug. */ void (*postclose) (struct drm_device *, struct drm_file *); /** * @lastclose: * * Called when the last &struct drm_file has been closed and there's * currently no userspace client for the &struct drm_device. * * Modern drivers should only use this to force-restore the fbdev * framebuffer using drm_fb_helper_restore_fbdev_mode_unlocked(). * Anything else would indicate there's something seriously wrong. * Modern drivers can also use this to execute delayed power switching * state changes, e.g. in conjunction with the :ref:`vga_switcheroo` * infrastructure. * * This is called after @postclose hook has been called. * * NOTE: * * All legacy drivers use this callback to de-initialize the hardware. * This is purely because of the shadow-attach model, where the DRM * kernel driver does not really own the hardware. Instead ownershipe is * handled with the help of userspace through an inheritedly racy dance * to set/unset the VT into raw mode. * * Legacy drivers initialize the hardware in the @firstopen callback, * which isn't even called for modern drivers. */ void (*lastclose) (struct drm_device *); /** * @unload: * * Reverse the effects of the driver load callback. Ideally, * the clean up performed by the driver should happen in the * reverse order of the initialization. Similarly to the load * hook, this handler is deprecated and its usage should be * dropped in favor of an open-coded teardown function at the * driver layer. 
See drm_dev_unregister() and drm_dev_put() * for the proper way to remove a &struct drm_device. * * The unload() hook is called right after unregistering * the device. * */ void (*unload) (struct drm_device *); /** * @release: * * Optional callback for destroying device data after the final * reference is released, i.e. the device is being destroyed. * * This is deprecated, clean up all memory allocations associated with a * &drm_device using drmm_add_action(), drmm_kmalloc() and related * managed resources functions. */ void (*release) (struct drm_device *); /** * @master_set: * * Called whenever the minor master is set. Only used by vmwgfx. */ void (*master_set)(struct drm_device *dev, struct drm_file *file_priv, bool from_open); /** * @master_drop: * * Called whenever the minor master is dropped. Only used by vmwgfx. */ void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv); /** * @debugfs_init: * * Allows drivers to create driver-specific debugfs files. */ void (*debugfs_init)(struct drm_minor *minor); /** * @gem_create_object: constructor for gem objects * * Hook for allocating the GEM object struct, for use by the CMA * and SHMEM GEM helpers. Returns a GEM object on success, or an * ERR_PTR()-encoded error code otherwise. */ struct drm_gem_object *(*gem_create_object)(struct drm_device *dev, size_t size); /** * @prime_handle_to_fd: * * PRIME export function. Only used by vmwgfx. */ int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); /** * @prime_fd_to_handle: * * PRIME import function. Only used by vmwgfx. */ int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); /** * @gem_prime_import: * * Import hook for GEM drivers. * * This defaults to drm_gem_prime_import() if not set. */ struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev, struct dma_buf *dma_buf); /** * @gem_prime_import_sg_table: * * Optional hook used by the PRIME helper functions * drm_gem_prime_import() respectively drm_gem_prime_import_dev(). */ struct drm_gem_object *(*gem_prime_import_sg_table)( struct drm_device *dev, struct dma_buf_attachment *attach, struct sg_table *sgt); /** * @dumb_create: * * This creates a new dumb buffer in the driver's backing storage manager (GEM, * TTM or something else entirely) and returns the resulting buffer handle. This * handle can then be wrapped up into a framebuffer modeset object. * * Note that userspace is not allowed to use such objects for render * acceleration - drivers must create their own private ioctls for such a use * case. * * Width, height and depth are specified in the &drm_mode_create_dumb * argument. The callback needs to fill the handle, pitch and size for * the created buffer. * * Called by the user via ioctl. * * Returns: * * Zero on success, negative errno on failure. */ int (*dumb_create)(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args); /** * @dumb_map_offset: * * Allocate an offset in the drm device node's address space to be able to * memory map a dumb buffer. * * The default implementation is drm_gem_create_mmap_offset(). GEM based * drivers must not overwrite this. * * Called by the user via ioctl. * * Returns: * * Zero on success, negative errno on failure. */ int (*dumb_map_offset)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset); /** * @show_fdinfo: * * Print device specific fdinfo. 
See Documentation/gpu/drm-usage-stats.rst. */ void (*show_fdinfo)(struct drm_printer *p, struct drm_file *f); /** @major: driver major number */ int major; /** @minor: driver minor number */ int minor; /** @patchlevel: driver patch level */ int patchlevel; /** @name: driver name */ char *name; /** @desc: driver description */ char *desc; /** @date: driver date */ char *date; /** * @driver_features: * Driver features, see &enum drm_driver_feature. Drivers can disable * some features on a per-instance basis using * &drm_device.driver_features. */ u32 driver_features; /** * @ioctls: * * Array of driver-private IOCTL description entries. See the chapter on * :ref:`IOCTL support in the userland interfaces * chapter<drm_driver_ioctl>` for the full details. */ const struct drm_ioctl_desc *ioctls; /** @num_ioctls: Number of entries in @ioctls. */ int num_ioctls; /** * @fops: * * File operations for the DRM device node. See the discussion in * :ref:`file operations<drm_driver_fops>` for in-depth coverage and * some examples. */ const struct file_operations *fops; }; void *__devm_drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset); /** * devm_drm_dev_alloc - Resource managed allocation of a &drm_device instance * @parent: Parent device object * @driver: DRM driver * @type: the type of the struct which contains struct &drm_device * @member: the name of the &drm_device within @type. * * This allocates and initialize a new DRM device. No device registration is done. * Call drm_dev_register() to advertice the device to user space and register it * with other core subsystems. This should be done last in the device * initialization sequence to make sure userspace can't access an inconsistent * state. * * The initial ref-count of the object is 1. Use drm_dev_get() and * drm_dev_put() to take and drop further ref-counts. * * It is recommended that drivers embed &struct drm_device into their own device * structure. * * Note that this manages the lifetime of the resulting &drm_device * automatically using devres. The DRM device initialized with this function is * automatically put on driver detach using drm_dev_put(). * * RETURNS: * Pointer to new DRM device, or ERR_PTR on failure. */ #define devm_drm_dev_alloc(parent, driver, type, member) \ ((type *) __devm_drm_dev_alloc(parent, driver, sizeof(type), \ offsetof(type, member))) struct drm_device *drm_dev_alloc(const struct drm_driver *driver, struct device *parent); int drm_dev_register(struct drm_device *dev, unsigned long flags); void drm_dev_unregister(struct drm_device *dev); void drm_dev_get(struct drm_device *dev); void drm_dev_put(struct drm_device *dev); void drm_put_dev(struct drm_device *dev); bool drm_dev_enter(struct drm_device *dev, int *idx); void drm_dev_exit(int idx); void drm_dev_unplug(struct drm_device *dev); /** * drm_dev_is_unplugged - is a DRM device unplugged * @dev: DRM device * * This function can be called to check whether a hotpluggable is unplugged. * Unplugging itself is singalled through drm_dev_unplug(). If a device is * unplugged, these two functions guarantee that any store before calling * drm_dev_unplug() is visible to callers of this function after it completes * * WARNING: This function fundamentally races against drm_dev_unplug(). It is * recommended that drivers instead use the underlying drm_dev_enter() and * drm_dev_exit() function pairs. 
*/ static inline bool drm_dev_is_unplugged(struct drm_device *dev) { int idx; if (drm_dev_enter(dev, &idx)) { drm_dev_exit(idx); return false; } return true; } /** * drm_core_check_all_features - check driver feature flags mask * @dev: DRM device to check * @features: feature flag(s) mask * * This checks @dev for driver features, see &drm_driver.driver_features, * &drm_device.driver_features, and the various &enum drm_driver_feature flags. * * Returns true if all features in the @features mask are supported, false * otherwise. */ static inline bool drm_core_check_all_features(const struct drm_device *dev, u32 features) { u32 supported = dev->driver->driver_features & dev->driver_features; return features && (supported & features) == features; } /** * drm_core_check_feature - check driver feature flags * @dev: DRM device to check * @feature: feature flag * * This checks @dev for driver features, see &drm_driver.driver_features, * &drm_device.driver_features, and the various &enum drm_driver_feature flags. * * Returns true if the @feature is supported, false otherwise. */ static inline bool drm_core_check_feature(const struct drm_device *dev, enum drm_driver_feature feature) { return drm_core_check_all_features(dev, feature); } /** * drm_drv_uses_atomic_modeset - check if the driver implements * atomic_commit() * @dev: DRM device * * This check is useful if drivers do not have DRIVER_ATOMIC set but * have atomic modesetting internally implemented. */ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_ATOMIC) || (dev->mode_config.funcs && dev->mode_config.funcs->atomic_commit != NULL); } /* TODO: Inline drm_firmware_drivers_only() in all its callers. */ static inline bool drm_firmware_drivers_only(void) { return video_firmware_drivers_only(); } #if defined(CONFIG_DEBUG_FS) void drm_debugfs_dev_init(struct drm_device *dev, struct dentry *root); #else static inline void drm_debugfs_dev_init(struct drm_device *dev, struct dentry *root) { } #endif #endif
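The devm_drm_dev_alloc() kerneldoc above recommends embedding struct drm_device in a driver-private structure, allocating it with devres, and registering last. A hypothetical sketch of that pattern follows; my_device, my_driver, my_probe and the platform-device binding are invented for illustration and are not part of this header, and DEFINE_DRM_GEM_FOPS is assumed to come from <drm/drm_gem.h>.

#include <linux/err.h>
#include <linux/platform_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_gem.h>

struct my_device {
	struct drm_device drm;	/* embedded DRM device */
	void __iomem *regs;	/* made-up driver-private state */
};

DEFINE_DRM_GEM_FOPS(my_fops);	/* assumed helper providing file_operations */

static const struct drm_driver my_driver = {
	.driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
	.fops	= &my_fops,
	.name	= "my-drm",
	.desc	= "Illustrative DRM driver",
	.date	= "20240101",
	.major	= 1,
};

static int my_probe(struct platform_device *pdev)
{
	struct my_device *priv;

	/* Allocates struct my_device with the embedded drm_device
	 * initialized; its lifetime is tied to the parent via devres. */
	priv = devm_drm_dev_alloc(&pdev->dev, &my_driver,
				  struct my_device, drm);
	if (IS_ERR(priv))
		return PTR_ERR(priv);

	/* ... hardware and mode-setting setup would go here ... */

	/* Advertise the device to userspace only when fully set up. */
	return drm_dev_register(&priv->drm, 0);
}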
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved.
*/ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <net/nfc/nfc.h> #include "nfc.h" #include "llcp.h" static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ 2, /* WKS */ 1, /* LTO */ 1, /* RW */ 0, /* SN */ 1, /* OPT */ 0, /* SDREQ */ 2, /* SDRES */ }; static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; return tlv[2]; } static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; return be16_to_cpu(*((__be16 *)(tlv + 2))); } static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; pr_debug("type %d\n", type); if (type >= LLCP_TLV_MAX) return NULL; length = llcp_tlv_length[type]; if (length == 0 && value_length == 0) return NULL; else if (length == 0) length = value_length; *tlv_length = 2 + length; tlv = kzalloc(2 + length, GFP_KERNEL); if (tlv == NULL) return tlv; tlv[0] = type; tlv[1] = length; memcpy(tlv + 2, value, length); return tlv; } struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) { struct nfc_llcp_sdp_tlv *sdres; u8 value[2]; sdres = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); if (sdres == NULL) return NULL; value[0] = tid; value[1] = sap; sdres->tlv = nfc_llcp_build_tlv(LLCP_TLV_SDRES, value, 2, &sdres->tlv_len); if (sdres->tlv == NULL) { kfree(sdres); return NULL; } sdres->tid = tid; sdres->sap = sap; INIT_HLIST_NODE(&sdres->node); return sdres; } struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; pr_debug("uri: %s, len: %zu\n", uri, uri_len); /* sdreq->tlv_len is u8, takes uri_len, + 3 for header, + 1 for NULL */ if (WARN_ON_ONCE(uri_len > U8_MAX - 4)) return NULL; sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); if (sdreq == NULL) return NULL; sdreq->tlv_len = uri_len + 3; if (uri[uri_len - 1] == 0) sdreq->tlv_len--; sdreq->tlv = kzalloc(sdreq->tlv_len + 1, GFP_KERNEL); if (sdreq->tlv == NULL) { kfree(sdreq); return NULL; } sdreq->tlv[0] = LLCP_TLV_SDREQ; sdreq->tlv[1] = sdreq->tlv_len - 2; sdreq->tlv[2] = tid; sdreq->tid = tid; sdreq->uri = sdreq->tlv + 3; memcpy(sdreq->uri, uri, uri_len); sdreq->time = jiffies; INIT_HLIST_NODE(&sdreq->node); return sdreq; } void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp) { kfree(sdp->tlv); kfree(sdp); } void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) { struct nfc_llcp_sdp_tlv *sdp; struct hlist_node *n; hlist_for_each_entry_safe(sdp, n, head, node) { hlist_del(&sdp->node); nfc_llcp_free_sdp_tlv(sdp); } } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, const u8 *tlv_array, u16 tlv_array_len) { const u8 *tlv = tlv_array; u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); if (local == NULL) return -ENODEV; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length 
%d\n", type, length); switch (type) { case LLCP_TLV_VERSION: local->remote_version = llcp_tlv_version(tlv); break; case LLCP_TLV_MIUX: local->remote_miu = llcp_tlv_miux(tlv) + 128; break; case LLCP_TLV_WKS: local->remote_wks = llcp_tlv_wks(tlv); break; case LLCP_TLV_LTO: local->remote_lto = llcp_tlv_lto(tlv) * 10; break; case LLCP_TLV_OPT: local->remote_opt = llcp_tlv_opt(tlv); break; default: pr_err("Invalid gt tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } pr_debug("version 0x%x miu %d lto %d opt 0x%x wks 0x%x\n", local->remote_version, local->remote_miu, local->remote_lto, local->remote_opt, local->remote_wks); return 0; } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, const u8 *tlv_array, u16 tlv_array_len) { const u8 *tlv = tlv_array; u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); if (sock == NULL) return -ENOTCONN; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); switch (type) { case LLCP_TLV_MIUX: sock->remote_miu = llcp_tlv_miux(tlv) + 128; break; case LLCP_TLV_RW: sock->remote_rw = llcp_tlv_rw(tlv); break; case LLCP_TLV_SN: break; default: pr_err("Invalid gt tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } pr_debug("sock %p rw %d miu %d\n", sock, sock->remote_rw, sock->remote_miu); return 0; } static struct sk_buff *llcp_add_header(struct sk_buff *pdu, u8 dsap, u8 ssap, u8 ptype) { u8 header[2]; pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); header[0] = (u8)((dsap << 2) | (ptype >> 2)); header[1] = (u8)((ptype << 6) | ssap); pr_debug("header 0x%x 0x%x\n", header[0], header[1]); skb_put_data(pdu, header, LLCP_HEADER_SIZE); return pdu; } static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ if (tlv == NULL) return NULL; skb_put_data(pdu, tlv, tlv_length); return pdu; } static struct sk_buff *llcp_allocate_pdu(struct nfc_llcp_sock *sock, u8 cmd, u16 size) { struct sk_buff *skb; int err; if (sock->ssap == 0) return NULL; skb = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, size + LLCP_HEADER_SIZE, &err); if (skb == NULL) { pr_err("Could not allocate PDU\n"); return NULL; } skb = llcp_add_header(skb, sock->dsap, sock->ssap, cmd); return skb; } int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock) { struct sk_buff *skb; struct nfc_dev *dev; struct nfc_llcp_local *local; local = sock->local; if (local == NULL) return -ENODEV; dev = sock->dev; if (dev == NULL) return -ENODEV; skb = llcp_allocate_pdu(sock, LLCP_PDU_DISC, 0); if (skb == NULL) return -ENOMEM; skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_symm(struct nfc_dev *dev) { struct sk_buff *skb; struct nfc_llcp_local *local; u16 size = 0; int err; local = nfc_llcp_find_local(dev); if (local == NULL) return -ENODEV; size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) { err = -ENOMEM; goto out; } skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, 0, 0, LLCP_PDU_SYMM); __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); err = nfc_data_exchange(dev, local->target_idx, skb, nfc_llcp_recv, local); out: nfc_llcp_local_put(local); return err; } int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; const u8 *service_name_tlv = NULL; const u8 *miux_tlv = NULL; 
const u8 *rw_tlv = NULL; u8 service_name_tlv_length = 0; u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; local = sock->local; if (local == NULL) return -ENODEV; if (sock->service_name != NULL) { service_name_tlv = nfc_llcp_build_tlv(LLCP_TLV_SN, sock->service_name, sock->service_name_len, &service_name_tlv_length); if (!service_name_tlv) { err = -ENOMEM; goto error_tlv; } size += service_name_tlv_length; } /* If the socket parameters are not set, use the local ones */ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? local->miux : sock->miux; rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); if (!miux_tlv) { err = -ENOMEM; goto error_tlv; } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); if (!rw_tlv) { err = -ENOMEM; goto error_tlv; } size += rw_tlv_length; pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len); skb = llcp_allocate_pdu(sock, LLCP_PDU_CONNECT, size); if (skb == NULL) { err = -ENOMEM; goto error_tlv; } llcp_add_tlv(skb, service_name_tlv, service_name_tlv_length); llcp_add_tlv(skb, miux_tlv, miux_tlv_length); llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); err = 0; error_tlv: if (err) pr_err("error %d\n", err); kfree(service_name_tlv); kfree(miux_tlv); kfree(rw_tlv); return err; } int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; const u8 *miux_tlv = NULL; const u8 *rw_tlv = NULL; u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; local = sock->local; if (local == NULL) return -ENODEV; /* If the socket parameters are not set, use the local ones */ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? local->miux : sock->miux; rw = sock->rw > LLCP_MAX_RW ? 
local->rw : sock->rw; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); if (!miux_tlv) { err = -ENOMEM; goto error_tlv; } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); if (!rw_tlv) { err = -ENOMEM; goto error_tlv; } size += rw_tlv_length; skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size); if (skb == NULL) { err = -ENOMEM; goto error_tlv; } llcp_add_tlv(skb, miux_tlv, miux_tlv_length); llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); err = 0; error_tlv: if (err) pr_err("error %d\n", err); kfree(miux_tlv); kfree(rw_tlv); return err; } static struct sk_buff *nfc_llcp_allocate_snl(struct nfc_llcp_local *local, size_t tlv_length) { struct sk_buff *skb; struct nfc_dev *dev; u16 size = 0; if (local == NULL) return ERR_PTR(-ENODEV); dev = local->dev; if (dev == NULL) return ERR_PTR(-ENODEV); size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; size += tlv_length; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOMEM); skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, LLCP_SAP_SDP, LLCP_SAP_SDP, LLCP_PDU_SNL); return skb; } int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local, struct hlist_head *tlv_list, size_t tlvs_len) { struct nfc_llcp_sdp_tlv *sdp; struct hlist_node *n; struct sk_buff *skb; skb = nfc_llcp_allocate_snl(local, tlvs_len); if (IS_ERR(skb)) return PTR_ERR(skb); hlist_for_each_entry_safe(sdp, n, tlv_list, node) { skb_put_data(skb, sdp->tlv, sdp->tlv_len); hlist_del(&sdp->node); nfc_llcp_free_sdp_tlv(sdp); } skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local, struct hlist_head *tlv_list, size_t tlvs_len) { struct nfc_llcp_sdp_tlv *sdreq; struct hlist_node *n; struct sk_buff *skb; skb = nfc_llcp_allocate_snl(local, tlvs_len); if (IS_ERR(skb)) return PTR_ERR(skb); mutex_lock(&local->sdreq_lock); if (hlist_empty(&local->pending_sdreqs)) mod_timer(&local->sdreq_timer, jiffies + msecs_to_jiffies(3 * local->remote_lto)); hlist_for_each_entry_safe(sdreq, n, tlv_list, node) { pr_debug("tid %d for %s\n", sdreq->tid, sdreq->uri); skb_put_data(skb, sdreq->tlv, sdreq->tlv_len); hlist_del(&sdreq->node); hlist_add_head(&sdreq->node, &local->pending_sdreqs); } mutex_unlock(&local->sdreq_lock); skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_dm(struct nfc_llcp_local *local, u8 ssap, u8 dsap, u8 reason) { struct sk_buff *skb; struct nfc_dev *dev; u16 size = 1; /* Reason code */ pr_debug("Sending DM reason 0x%x\n", reason); if (local == NULL) return -ENODEV; dev = local->dev; if (dev == NULL) return -ENODEV; size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) return -ENOMEM; skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, dsap, ssap, LLCP_PDU_DM); skb_put_data(skb, &reason, 1); skb_queue_head(&local->tx_queue, skb); return 0; } int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, struct msghdr *msg, size_t len) { struct sk_buff *pdu; struct sock *sk = &sock->sk; struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_data, *msg_ptr; u16 remote_miu; pr_debug("Send I frame len %zd\n", len); local = sock->local; if (local == NULL) return -ENODEV; /* Remote is ready but has not acknowledged our frames */ if((sock->remote_ready && skb_queue_len(&sock->tx_pending_queue) >= sock->remote_rw && 
skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Pending queue is full %d frames\n", skb_queue_len(&sock->tx_pending_queue)); return -ENOBUFS; } /* Remote is not ready and we've been queueing enough frames */ if ((!sock->remote_ready && skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Tx queue is full %d frames\n", skb_queue_len(&sock->tx_queue)); return -ENOBUFS; } msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } remaining_len = len; msg_ptr = msg_data; do { remote_miu = sock->remote_miu > LLCP_MAX_MIU ? LLCP_DEFAULT_MIU : sock->remote_miu; frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); pdu = llcp_allocate_pdu(sock, LLCP_PDU_I, frag_len + LLCP_SEQUENCE_SIZE); if (pdu == NULL) { kfree(msg_data); return -ENOMEM; } skb_put(pdu, LLCP_SEQUENCE_SIZE); if (likely(frag_len > 0)) skb_put_data(pdu, msg_ptr, frag_len); skb_queue_tail(&sock->tx_queue, pdu); lock_sock(sk); nfc_llcp_queue_i_frames(sock); release_sock(sk); remaining_len -= frag_len; msg_ptr += frag_len; } while (remaining_len > 0); kfree(msg_data); return len; } int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, struct msghdr *msg, size_t len) { struct sk_buff *pdu; struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_ptr, *msg_data; u16 remote_miu; int err; pr_debug("Send UI frame len %zd\n", len); local = sock->local; if (local == NULL) return -ENODEV; msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } remaining_len = len; msg_ptr = msg_data; do { remote_miu = sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : sock->remote_miu; frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0, frag_len + LLCP_HEADER_SIZE, &err); if (pdu == NULL) { pr_err("Could not allocate PDU (error=%d)\n", err); len -= remaining_len; if (len == 0) len = err; break; } pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); if (likely(frag_len > 0)) skb_put_data(pdu, msg_ptr, frag_len); /* No need to check for the peer RW for UI frames */ skb_queue_tail(&local->tx_queue, pdu); remaining_len -= frag_len; msg_ptr += frag_len; } while (remaining_len > 0); kfree(msg_data); return len; } int nfc_llcp_send_rr(struct nfc_llcp_sock *sock) { struct sk_buff *skb; struct nfc_llcp_local *local; pr_debug("Send rr nr %d\n", sock->recv_n); local = sock->local; if (local == NULL) return -ENODEV; skb = llcp_allocate_pdu(sock, LLCP_PDU_RR, LLCP_SEQUENCE_SIZE); if (skb == NULL) return -ENOMEM; skb_put(skb, LLCP_SEQUENCE_SIZE); skb->data[2] = sock->recv_n; skb_queue_head(&local->tx_queue, skb); return 0; }
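nfc_llcp_build_tlv() and the parse helpers above all use the same two-byte TLV header: tlv[0] is the type, tlv[1] the value length, and parsers advance by length + 2, as in nfc_llcp_parse_gb_tlv(). A small standalone sketch of that layout follows; the buffer contents are made up for the example.

/* Illustrative userspace walk over an LLCP-style TLV buffer:
 * [type][length][value...], advancing by length + 2 per entry.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* VERSION (type 1, 1 byte) then MIUX (type 2, 2 bytes),
	 * matching the lengths in the llcp_tlv_length[] table above. */
	const uint8_t gb[] = { 0x01, 0x01, 0x11, 0x02, 0x02, 0x04, 0x80 };
	size_t offset = 0;

	while (offset + 2 <= sizeof(gb)) {
		uint8_t type = gb[offset];
		uint8_t len = gb[offset + 1];

		printf("type 0x%02x length %u\n", type, len);
		offset += len + 2;	/* skip header + value */
	}
	return 0;
}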
// SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator
est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg __read_mostly = { .name = "RATEEST", .revision = 0, .family = NFPROTO_UNSPEC, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_target(&xt_rateest_tg_reg); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
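xt_rateest_hash() above spreads estimators across RATEEST_HSIZE buckets by masking a jhash() of the fixed-size name field with RATEEST_HSIZE - 1, which only distributes evenly because RATEEST_HSIZE (16) is a power of two. The standalone toy below illustrates that masking; its djb2-style hash is merely a stand-in for jhash() and the names are invented.

/* Toy userspace illustration of power-of-two bucket masking. */
#include <stdio.h>

#define RATEEST_HSIZE 16

static unsigned int toy_hash(const char *name)
{
	unsigned int h = 5381;

	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h & (RATEEST_HSIZE - 1);	/* same masking as the kernel code */
}

int main(void)
{
	const char *names[] = { "uplink", "downlink", "guest" };

	for (unsigned int i = 0; i < 3; i++)
		printf("%-10s -> bucket %u\n", names[i], toy_hash(names[i]));
	return 0;
}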
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_FFS_H_
#define _ASM_GENERIC_BITOPS_FFS_H_

/**
 * generic_ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from ffz (man ffs).
 */
static inline int generic_ffs(int x)
{
	int r = 1;

	if (!x)
		return 0;
	if (!(x & 0xffff)) {
		x >>= 16;
		r += 16;
	}
	if (!(x & 0xff)) {
		x >>= 8;
		r += 8;
	}
	if (!(x & 0xf)) {
		x >>= 4;
		r += 4;
	}
	if (!(x & 3)) {
		x >>= 2;
		r += 2;
	}
	if (!(x & 1)) {
		x >>= 1;
		r += 1;
	}
	return r;
}

#ifndef __HAVE_ARCH_FFS
#define ffs(x) generic_ffs(x)
#endif

#endif /* _ASM_GENERIC_BITOPS_FFS_H_ */
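generic_ffs() narrows the search by halves: each test skips a block of zero bits and adds that block's width to the running result. A standalone userspace copy with a few sample inputs (expected results noted in the comment) illustrates the behaviour.

/* Quick check of generic_ffs(): expected output is
 * ffs(0x0) = 0, ffs(0x1) = 1, ffs(0x8000) = 16, ffs(0x80000000) = 32.
 */
#include <stdio.h>

static int generic_ffs(int x)
{
	int r = 1;

	if (!x)
		return 0;
	if (!(x & 0xffff)) { x >>= 16; r += 16; }
	if (!(x & 0xff))   { x >>= 8;  r += 8; }
	if (!(x & 0xf))    { x >>= 4;  r += 4; }
	if (!(x & 3))      { x >>= 2;  r += 2; }
	if (!(x & 1))      { x >>= 1;  r += 1; }
	return r;
}

int main(void)
{
	const int samples[] = { 0, 1, 0x8000, (int)0x80000000 };

	for (int i = 0; i < 4; i++)
		printf("ffs(0x%x) = %d\n", samples[i], generic_ffs(samples[i]));
	return 0;
}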
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_VXLAN_H #define __NET_VXLAN_H 1 #include <linux/if_vlan.h> #include <linux/rhashtable-types.h> #include <net/udp_tunnel.h> #include <net/dst_metadata.h> #include <net/rtnetlink.h> #include <net/switchdev.h> #include <net/nexthop.h> #define IANA_VXLAN_UDP_PORT 4789 #define IANA_VXLAN_GPE_UDP_PORT 4790 /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * I = VXLAN Network Identifier (VNI) present. */ struct vxlanhdr { __be32 vx_flags; __be32 vx_vni; }; /* VXLAN header flags.
*/ #define VXLAN_HF_VNI cpu_to_be32(BIT(27)) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) #define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) #define FDB_HASH_BITS 8 #define FDB_HASH_SIZE (1<<FDB_HASH_BITS) /* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R|R|R|C| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) |O| Csum start | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * C = Remote checksum offload bit. When set indicates that the * remote checksum offload data is present. * * O = Offset bit. Indicates the checksum offset relative to * checksum start. * * Csum start = Checksum start divided by two. * * http://tools.ietf.org/html/draft-herbert-vxlan-rco */ /* VXLAN-RCO header flags. */ #define VXLAN_HF_RCO cpu_to_be32(BIT(21)) /* Remote checksum offload header option */ #define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ #define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ #define VXLAN_RCO_SHIFT 1 /* Left shift of start */ #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) #define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* * VXLAN Group Based Policy Extension (VXLAN_F_GBP): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * G = Group Policy ID present. * * D = Don't Learn bit. When set, this bit indicates that the egress * VTEP MUST NOT learn the source address of the encapsulated frame. * * A = Indicates that the group policy has already been applied to * this packet. Policies MUST NOT be applied by devices when the * A bit is set. * * https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, reserved_flags3:3; #else #error "Please fix <asm/byteorder.h>" #endif __be16 policy_id; __be32 vx_vni; }; /* VXLAN-GBP header flags. */ #define VXLAN_HF_GBP cpu_to_be32(BIT(31)) #define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define VXLAN_GBP_DONT_LEARN (BIT(6) << 16) #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) #define VXLAN_GBP_MASK (VXLAN_GBP_DONT_LEARN | VXLAN_GBP_POLICY_APPLIED | \ VXLAN_GBP_ID_MASK) /* * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Ver = Version. 
Indicates VXLAN GPE protocol version. * * P = Next Protocol Bit. The P bit is set to indicate that the * Next Protocol field is present. * * O = OAM Flag Bit. The O bit is set to indicate that the packet * is an OAM packet. * * Next Protocol = This 8 bit field indicates the protocol header * immediately following the VXLAN GPE header. * * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 */ struct vxlanhdr_gpe { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 oam_flag:1, reserved_flags1:1, np_applied:1, instance_applied:1, version:2, reserved_flags2:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags2:2, version:2, instance_applied:1, np_applied:1, reserved_flags1:1, oam_flag:1; #endif u8 reserved_flags3; u8 reserved_flags4; u8 next_protocol; __be32 vx_vni; }; /* VXLAN-GPE header flags. */ #define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) #define VXLAN_HF_NP cpu_to_be32(BIT(26)) #define VXLAN_HF_OAM cpu_to_be32(BIT(24)) #define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ cpu_to_be32(0xff)) struct vxlan_metadata { u32 gbp; }; /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; refcount_t refcnt; u32 flags; }; union vxlan_addr { struct sockaddr_in sin; struct sockaddr_in6 sin6; struct sockaddr sa; }; struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; u8 offloaded:1; __be32 remote_vni; u32 remote_ifindex; struct net_device *remote_dev; struct list_head list; struct rcu_head rcu; struct dst_cache dst_cache; }; struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; u16 port_min; u16 port_max; u8 tos; u8 ttl; __be32 label; enum ifla_vxlan_label_policy label_policy; u32 flags; unsigned long age_interval; unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; }; enum { VXLAN_VNI_STATS_RX, VXLAN_VNI_STATS_RX_DROPS, VXLAN_VNI_STATS_RX_ERRORS, VXLAN_VNI_STATS_TX, VXLAN_VNI_STATS_TX_DROPS, VXLAN_VNI_STATS_TX_ERRORS, }; struct vxlan_vni_stats { u64 rx_packets; u64 rx_bytes; u64 rx_drops; u64 rx_errors; u64 tx_packets; u64 tx_bytes; u64 tx_drops; u64 tx_errors; }; struct vxlan_vni_stats_pcpu { struct vxlan_vni_stats stats; struct u64_stats_sync syncp; }; struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; }; struct vxlan_vni_node { struct rhash_head vnode; struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head vlist; __be32 vni; union vxlan_addr remote_ip; /* default remote ip for this vni */ struct vxlan_vni_stats_pcpu __percpu *stats; struct rcu_head rcu; }; struct vxlan_vni_group { struct rhashtable vni_hash; struct list_head vni_list; u32 num_vnis; }; /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head next; /* vxlan's per namespace list */ struct vxlan_sock __rcu *vn4_sock; /* listening socket for IPv4 */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock __rcu *vn6_sock; /* listening socket for IPv6 */ #endif struct net_device *dev; struct net *net; /* netns for packet i/o */ struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; spinlock_t hash_lock[FDB_HASH_SIZE]; unsigned int addrcnt; struct gro_cells gro_cells; struct 
vxlan_config cfg; struct vxlan_vni_group __rcu *vnigrp; struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; struct hlist_head mdb_list; unsigned int mdb_seq; }; #define VXLAN_F_LEARN 0x01 #define VXLAN_F_PROXY 0x02 #define VXLAN_F_RSC 0x04 #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 #define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 #define VXLAN_F_REMCSUM_RX 0x400 #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable */ #define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ VXLAN_F_GPE | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ VXLAN_F_IPV6 | \ VXLAN_F_IPV6_LINKLOCAL | \ VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) { u8 l4_hdr = 0; if (!skb->encapsulation) return features; switch (vlan_get_protocol(skb)) { case htons(ETH_P_IP): l4_hdr = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: return features; } if ((l4_hdr == IPPROTO_UDP) && (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } static inline int vxlan_headroom(u32 flags) { /* VXLAN: IP4/6 header + UDP + VXLAN + Ethernet header */ /* VXLAN-GPE: IP4/6 header + UDP + VXLAN */ return (flags & VXLAN_F_IPV6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr)) + sizeof(struct udphdr) + sizeof(struct vxlanhdr) + (flags & VXLAN_F_GPE ? 0 : ETH_HLEN); } static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) { return (struct vxlanhdr *)(udp_hdr(skb) + 1); } static inline __be32 vxlan_vni(__be32 vni_field) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni_field >> 8); #else return (__force __be32)((__force u32)(vni_field & VXLAN_VNI_MASK) << 8); #endif } static inline __be32 vxlan_vni_field(__be32 vni) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni << 8); #else return (__force __be32)((__force u32)vni >> 8); #endif } static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; } static inline size_t vxlan_rco_offset(__be32 vni_field) { return (vni_field & VXLAN_RCO_UDP) ? 
offsetof(struct udphdr, check) : offsetof(struct tcphdr, check); } static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) { __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); if (offset == offsetof(struct udphdr, check)) vni_field |= VXLAN_RCO_UDP; return vni_field; } static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_any(&ipa->sin6.sin6_addr); else return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); else return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #else /* !IS_ENABLED(CONFIG_IPV6) */ static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #endif /* IS_ENABLED(CONFIG_IPV6) */ static inline bool netif_is_vxlan(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "vxlan"); } struct switchdev_notifier_vxlan_fdb_info { struct switchdev_notifier_info info; /* must be first */ union vxlan_addr remote_ip; __be16 remote_port; __be32 remote_vni; u32 remote_ifindex; u8 eth_addr[ETH_ALEN]; __be32 vni; bool offloaded; bool added_by_user; }; #if IS_ENABLED(CONFIG_VXLAN) int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info); int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni); #else static inline int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { return -ENOENT; } static inline int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { } #endif static inline void vxlan_flag_attr_error(int attrtype, struct netlink_ext_ack *extack) { #define VXLAN_FLAG(flg) \ case IFLA_VXLAN_##flg: \ NL_SET_ERR_MSG_MOD(extack, \ "cannot change " #flg " flag"); \ break switch (attrtype) { VXLAN_FLAG(TTL_INHERIT); VXLAN_FLAG(LEARNING); VXLAN_FLAG(PROXY); VXLAN_FLAG(RSC); VXLAN_FLAG(L2MISS); VXLAN_FLAG(L3MISS); VXLAN_FLAG(COLLECT_METADATA); VXLAN_FLAG(UDP_ZERO_CSUM6_TX); VXLAN_FLAG(UDP_ZERO_CSUM6_RX); VXLAN_FLAG(REMCSUM_TX); VXLAN_FLAG(REMCSUM_RX); VXLAN_FLAG(GBP); VXLAN_FLAG(GPE); VXLAN_FLAG(REMCSUM_NOPARTIAL); default: NL_SET_ERR_MSG_MOD(extack, \ "cannot change flag"); break; } #undef VXLAN_FLAG } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; switch (nhc->nhc_gw_family) { case AF_INET: rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; rdst->remote_ip.sa.sa_family = AF_INET; break; case AF_INET6: rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; rdst->remote_ip.sa.sa_family = AF_INET6; break; } return true; } static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const 
struct vxlan_metadata *md) { struct vxlanhdr_gbp *gbp; if (!md->gbp) return; gbp = (struct vxlanhdr_gbp *)vxh; vxh->vx_flags |= VXLAN_HF_GBP; if (md->gbp & VXLAN_GBP_DONT_LEARN) gbp->dont_learn = 1; if (md->gbp & VXLAN_GBP_POLICY_APPLIED) gbp->policy_applied = 1; gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } #endif
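The vxlan_vni() and vxlan_vni_field() helpers above encode the 24-bit VNI into the upper three bytes of the big-endian vx_vni word, leaving the low byte reserved. The following is a minimal, self-contained userspace sketch of the same mapping; it is an illustration only, not part of the header, and htonl()/ntohl() stand in for the kernel's cpu_to_be32()/be32_to_cpu().

/*
 * Userspace sketch (assumption: not kernel code) of the vx_vni field layout:
 * the 24-bit VNI occupies the top three bytes of the big-endian word, the low
 * byte is reserved, matching vxlan_vni()/vxlan_vni_field() above.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define VXLAN_VID_MASK_HOST	((1u << 24) - 1)

/* Host-order VNI -> network-order vx_vni field (low byte reserved/zero). */
static uint32_t vni_to_field(uint32_t vni)
{
	return htonl((vni & VXLAN_VID_MASK_HOST) << 8);
}

/* Network-order vx_vni field -> host-order VNI. */
static uint32_t field_to_vni(uint32_t field)
{
	return (ntohl(field) >> 8) & VXLAN_VID_MASK_HOST;
}

int main(void)
{
	uint32_t vni = 5001;		/* 0x001389 */
	uint32_t field = vni_to_field(vni);

	/* For VNI 5001 the wire bytes of vx_vni are 00 13 89 00. */
	printf("vni=%u field=0x%08x decoded=%u\n",
	       vni, ntohl(field), field_to_vni(field));
	return 0;
}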
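Likewise, vxlan_headroom() above simply sums the outer headers a VXLAN netdevice must reserve: an IPv4 or IPv6 header, plus UDP, plus the 8-byte VXLAN header, plus the inner Ethernet header except in GPE mode. A stand-alone sketch tabulating those totals follows; the numeric header sizes (20/40-byte IP, 8-byte UDP, 8-byte VXLAN, 14-byte Ethernet) are the usual fixed sizes and are assumed here rather than taken from kernel struct definitions.

/*
 * Userspace sketch (assumption: not kernel code) mirroring the arithmetic of
 * vxlan_headroom(): outer IP + UDP + VXLAN, plus inner Ethernet unless GPE.
 */
#include <stdio.h>

#define HDR_IPV4	20
#define HDR_IPV6	40
#define HDR_UDP		8
#define HDR_VXLAN	8
#define HDR_ETH		14

static int headroom(int ipv6, int gpe)
{
	return (ipv6 ? HDR_IPV6 : HDR_IPV4) + HDR_UDP + HDR_VXLAN +
	       (gpe ? 0 : HDR_ETH);
}

int main(void)
{
	/* Expected totals: 50, 70, 36 and 56 bytes respectively. */
	printf("VXLAN/IPv4:     %d\n", headroom(0, 0));
	printf("VXLAN/IPv6:     %d\n", headroom(1, 0));
	printf("VXLAN-GPE/IPv4: %d\n", headroom(0, 1));
	printf("VXLAN-GPE/IPv6: %d\n", headroom(1, 1));
	return 0;
}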
/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
#include <linux/sched.h>

MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");

/* Ordinary requests have even IDs, while interrupts IDs are odd */
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)

static struct kmem_cache *fuse_req_cachep;

static struct fuse_dev *fuse_get_dev(struct file *file)
{
	/*
	 * Lockless access is OK, because file->private data is set
	 * once during mount and is valid until the file is released.
*/ return READ_ONCE(file->private_data); } static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); req->fm = fm; } static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) fuse_request_init(fm, req); return req; } static void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); } static void __fuse_get_request(struct fuse_req *req) { refcount_inc(&req->count); } /* Must be called with > 1 refcount */ static void __fuse_put_request(struct fuse_req *req) { refcount_dec(&req->count); } void fuse_set_initialized(struct fuse_conn *fc) { /* Make sure stores before this are seen on another CPU */ smp_wmb(); fc->initialized = 1; } static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { return !fc->initialized || (for_background && fc->blocked); } static void fuse_drop_waiting(struct fuse_conn *fc) { /* * lockess check of fc->connected is okay, because atomic_dec_and_test() * provides a memory barrier matched with the one in fuse_wait_aborted() * to ensure no wake-up is missed. */ if (atomic_dec_and_test(&fc->num_waiting) && !READ_ONCE(fc->connected)) { /* wake up aborters */ wake_up_all(&fc->blocked_waitq); } } static void fuse_put_request(struct fuse_req *req); static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; int err; atomic_inc(&fc->num_waiting); if (fuse_block_alloc(fc, for_background)) { err = -EINTR; if (wait_event_killable_exclusive(fc->blocked_waitq, !fuse_block_alloc(fc, for_background))) goto out; } /* Matches smp_wmb() in fuse_set_initialized() */ smp_rmb(); err = -ENOTCONN; if (!fc->connected) goto out; err = -ECONNREFUSED; if (fc->conn_error) goto out; req = fuse_request_alloc(fm, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) wake_up(&fc->blocked_waitq); goto out; } req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); if (unlikely(req->in.h.uid == ((uid_t)-1) || req->in.h.gid == ((gid_t)-1))) { fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } return req; out: fuse_drop_waiting(fc); return ERR_PTR(err); } static void fuse_put_request(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background * request was allocated but not sent */ spin_lock(&fc->bg_lock); if (!fc->blocked) wake_up(&fc->blocked_waitq); spin_unlock(&fc->bg_lock); } if (test_bit(FR_WAITING, &req->flags)) { __clear_bit(FR_WAITING, &req->flags); fuse_drop_waiting(fc); } fuse_request_free(req); } } unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { unsigned nbytes = 0; unsigned i; for (i = 0; i < numargs; i++) nbytes += args[i].size; return nbytes; } EXPORT_SYMBOL_GPL(fuse_len_args); u64 fuse_get_unique(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, 
FUSE_PQ_HASH_BITS); } /* * A new request is available, wake fiq->waitq */ static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) __releases(fiq->lock) { wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); spin_unlock(&fiq->lock); } const struct fuse_iqueue_ops fuse_dev_fiq_ops = { .wake_forget_and_unlock = fuse_dev_wake_and_unlock, .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, .wake_pending_and_unlock = fuse_dev_wake_and_unlock, }; EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); static void queue_request_and_unlock(struct fuse_iqueue *fiq, struct fuse_req *req) __releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); fiq->ops->wake_pending_and_unlock(fiq); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { struct fuse_iqueue *fiq = &fc->iq; forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; spin_lock(&fiq->lock); if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; fiq->ops->wake_forget_and_unlock(fiq); } else { kfree(forget); spin_unlock(&fiq->lock); } } static void flush_bg_queue(struct fuse_conn *fc) { struct fuse_iqueue *fiq = &fc->iq; while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); queue_request_and_unlock(fiq, req); } } /* * This function is called when a request is finished. Either a reply * has arrived or it was aborted (and not yet sent) or some error * occurred during communication with userspace, or the device file * was closed. The requester thread is woken up (if still waiting), * the 'end' callback is called if given, else the reference to the * request is released */ void fuse_request_end(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; struct fuse_iqueue *fiq = &fc->iq; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; /* * test_and_set_bit() implies smp_mb() between bit * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). */ if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) { fc->blocked = 0; wake_up(&fc->blocked_waitq); } else if (!fc->blocked) { /* * Wake up next waiter, if any. It's okay to use * waitqueue_active(), as we've already synced up * fc->blocked with waiters with the wake_up() call * above. 
*/ if (waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); } fc->num_background--; fc->active_background--; flush_bg_queue(fc); spin_unlock(&fc->bg_lock); } else { /* Wake up waiter sleeping in request_wait_answer() */ wake_up(&req->waitq); } if (test_bit(FR_ASYNC, &req->flags)) req->args->end(fm, req->args, req->out.h.error); put_request: fuse_put_request(req); } EXPORT_SYMBOL_GPL(fuse_request_end); static int queue_interrupt(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { spin_unlock(&fiq->lock); return -EINVAL; } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); /* * Pairs with smp_mb() implied by test_and_set_bit() * from fuse_request_end(). */ smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); return 0; } fiq->ops->wake_interrupt_and_unlock(fiq); } else { spin_unlock(&fiq->lock); } return 0; } static void request_wait_answer(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; int err; if (!fc->no_interrupt) { /* Any signal may interrupt this */ err = wait_event_interruptible(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; set_bit(FR_INTERRUPTED, &req->flags); /* matches barrier in fuse_dev_do_read() */ smp_mb__after_atomic(); if (test_bit(FR_SENT, &req->flags)) queue_interrupt(req); } if (!test_bit(FR_FORCE, &req->flags)) { /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; spin_lock(&fiq->lock); /* Request is not yet in userspace, bail out */ if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); spin_unlock(&fiq->lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } spin_unlock(&fiq->lock); } /* * Either request is already in userspace, or it was forced. * Wait it out. 
*/ wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); spin_lock(&fiq->lock); if (!fiq->connected) { spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); /* acquire extra reference, since request is still needed after fuse_request_end() */ __fuse_get_request(req); queue_request_and_unlock(fiq, req); request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } } static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { if (fc->minor < 4 && args->opcode == FUSE_STATFS) args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; if (fc->minor < 9) { switch (args->opcode) { case FUSE_LOOKUP: case FUSE_CREATE: case FUSE_MKNOD: case FUSE_MKDIR: case FUSE_SYMLINK: case FUSE_LINK: args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; break; case FUSE_GETATTR: case FUSE_SETATTR: args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; break; } } if (fc->minor < 12) { switch (args->opcode) { case FUSE_CREATE: args->in_args[0].size = sizeof(struct fuse_open_in); break; case FUSE_MKNOD: args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; break; } } } static void fuse_force_creds(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; req->args = args; if (args->is_ext) req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; if (args->end) __set_bit(FR_ASYNC, &req->flags); } ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; ssize_t ret; if (args->force) { atomic_inc(&fc->num_waiting); req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) fuse_force_creds(req); __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); } /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); fuse_args_to_req(req, args); if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); __fuse_request_send(req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); ret = args->out_args[args->out_numargs - 1].size; } fuse_put_request(req); return ret; } static bool fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; bool queued = false; WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { __set_bit(FR_WAITING, &req->flags); atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; } spin_unlock(&fc->bg_lock); return queued; } int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags) { struct fuse_req *req; if (args->force) { WARN_ON(!args->nocreds); req = fuse_request_alloc(fm, gfp_flags); if (!req) return -ENOMEM; 
__set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); req = fuse_get_req(fm, true); if (IS_ERR(req)) return PTR_ERR(req); } fuse_args_to_req(req, args); if (!fuse_request_queue_background(req)) { fuse_put_request(req); return -ENOTCONN; } return 0; } EXPORT_SYMBOL_GPL(fuse_simple_background); static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique) { struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; int err = 0; req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; fuse_args_to_req(req, args); spin_lock(&fiq->lock); if (fiq->connected) { queue_request_and_unlock(fiq, req); } else { err = -ENODEV; spin_unlock(&fiq->lock); fuse_put_request(req); } return err; } /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. */ static int lock_request(struct fuse_req *req) { int err = 0; if (req) { spin_lock(&req->waitq.lock); if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else set_bit(FR_LOCKED, &req->flags); spin_unlock(&req->waitq.lock); } return err; } /* * Unlock request. If it was aborted while locked, caller is responsible * for unlocking and ending the request. */ static int unlock_request(struct fuse_req *req) { int err = 0; if (req) { spin_lock(&req->waitq.lock); if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else clear_bit(FR_LOCKED, &req->flags); spin_unlock(&req->waitq.lock); } return err; } struct fuse_copy_state { int write; struct fuse_req *req; struct iov_iter *iter; struct pipe_buffer *pipebufs; struct pipe_buffer *currbuf; struct pipe_inode_info *pipe; unsigned long nr_segs; struct page *pg; unsigned len; unsigned offset; unsigned move_pages:1; }; static void fuse_copy_init(struct fuse_copy_state *cs, int write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; cs->iter = iter; } /* Unmap and put previous page of userspace buffer */ static void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; if (cs->write) buf->len = PAGE_SIZE - cs->len; cs->currbuf = NULL; } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); } cs->pg = NULL; } /* * Get another pagefull of userspace buffer, and map it to kernel * address space, and lock request */ static int fuse_copy_fill(struct fuse_copy_state *cs) { struct page *page; int err; err = unlock_request(cs->req); if (err) return err; fuse_copy_finish(cs); if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; if (!cs->write) { err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; BUG_ON(!cs->nr_segs); cs->currbuf = buf; cs->pg = buf->page; cs->offset = buf->offset; cs->len = buf->len; cs->pipebufs++; cs->nr_segs--; } else { if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; page = alloc_page(GFP_HIGHUSER); if (!page) return -ENOMEM; buf->page = page; buf->offset = 0; buf->len = 0; cs->currbuf = buf; cs->pg = page; cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; } } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); if (err < 0) return err; BUG_ON(!err); cs->len = err; cs->offset = off; cs->pg = page; } return lock_request(cs->req); } /* Do as much copy to/from userspace buffer as we can */ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = 
min(*size, cs->len); if (val) { void *pgaddr = kmap_local_page(cs->pg); void *buf = pgaddr + cs->offset; if (cs->write) memcpy(buf, *val, ncpy); else memcpy(*val, buf, ncpy); kunmap_local(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; return ncpy; } static int fuse_check_folio(struct folio *folio) { if (folio_mapped(folio) || folio->mapping != NULL || (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | 1 << PG_waiters | LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; } static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; struct folio *oldfolio = page_folio(*pagep); struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; fuse_copy_finish(cs); err = pipe_buf_confirm(cs->pipe, buf); if (err) goto out_put_old; BUG_ON(!cs->nr_segs); cs->currbuf = buf; cs->len = buf->len; cs->pipebufs++; cs->nr_segs--; if (cs->len != PAGE_SIZE) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; newfolio = page_folio(buf->page); if (!folio_test_uptodate(newfolio)) folio_mark_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; if (WARN_ON(folio_test_dirty(oldfolio) || folio_test_writeback(oldfolio))) goto out_fallback_unlock; if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; replace_page_cache_folio(oldfolio, newfolio); folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf); err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { folio_unlock(newfolio); folio_put(newfolio); goto out_put_old; } folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ folio_put(oldfolio); return err; out_fallback_unlock: folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; err = lock_request(cs->req); if (!err) err = 1; goto out_put_old; } static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; get_page(page); err = unlock_request(cs->req); if (err) { put_page(page); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; buf->page = page; buf->offset = offset; buf->len = count; cs->pipebufs++; cs->nr_segs++; cs->len = 0; return 0; } /* * Copy a page in the request to/from the userspace buffer. 
Must be * done atomically */ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, unsigned offset, unsigned count, int zeroing) { int err; struct page *page = *pagep; if (page && zeroing && count < PAGE_SIZE) clear_highpage(page); while (count) { if (cs->write && cs->pipebufs && page) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. */ if (cs->req->args->user_pages) { err = fuse_copy_fill(cs); if (err) return err; } else { return fuse_ref_page(cs, page, offset, count); } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { err = fuse_try_move_page(cs, pagep); if (err <= 0) return err; } else { err = fuse_copy_fill(cs); if (err) return err; } } if (page) { void *mapaddr = kmap_local_page(page); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); kunmap_local(mapaddr); } else offset += fuse_copy_do(cs, NULL, &count); } if (page && !cs->write) flush_dcache_page(page); return 0; } /* Copy pages in the request to/from userspace buffer */ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int zeroing) { unsigned i; struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); if (err) return err; nbytes -= count; } return 0; } /* Copy a single argument in the request to/from userspace buffer */ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) { while (size) { if (!cs->len) { int err = fuse_copy_fill(cs); if (err) return err; } fuse_copy_do(cs, &val, &size); } return 0; } /* Copy request arguments to/from userspace buffer */ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, unsigned argpages, struct fuse_arg *args, int zeroing) { int err = 0; unsigned i; for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) err = fuse_copy_pages(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } return err; } static int forget_pending(struct fuse_iqueue *fiq) { return fiq->forget_list_head.next != NULL; } static int request_pending(struct fuse_iqueue *fiq) { return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || forget_pending(fiq); } /* * Transfer an interrupt request to userspace * * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * * Called with fiq->lock held, releases it */ static int fuse_read_interrupt(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) __releases(fiq->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; unsigned reqsize = sizeof(ih) + sizeof(arg); int err; list_del_init(&req->intr_entry); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; ih.opcode = FUSE_INTERRUPT; ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; spin_unlock(&fiq->lock); if (nbytes < reqsize) return -EINVAL; err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) err = fuse_copy_one(cs, &arg, sizeof(arg)); fuse_copy_finish(cs); return err ? 
err : reqsize; } struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, unsigned int max, unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; unsigned count; for (count = 0; *newhead != NULL && count < max; count++) newhead = &(*newhead)->next; fiq->forget_list_head.next = *newhead; *newhead = NULL; if (fiq->forget_list_head.next == NULL) fiq->forget_list_tail = &fiq->forget_list_head; if (countp != NULL) *countp = count; return head; } EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) __releases(fiq->lock) { int err; struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; spin_unlock(&fiq->lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) err = fuse_copy_one(cs, &arg, sizeof(arg)); fuse_copy_finish(cs); if (err) return err; return ih.len; } static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) __releases(fiq->lock) { int err; unsigned max_forgets; unsigned count; struct fuse_forget_link *head; struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; if (nbytes < ih.len) { spin_unlock(&fiq->lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); head = fuse_dequeue_forget(fiq, max_forgets, &count); spin_unlock(&fiq->lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) err = fuse_copy_one(cs, &arg, sizeof(arg)); while (head) { struct fuse_forget_link *forget = head; if (!err) { err = fuse_copy_one(cs, &forget->forget_one, sizeof(forget->forget_one)); } head = forget->next; kfree(forget); } fuse_copy_finish(cs); if (err) return err; return ih.len; } static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) __releases(fiq->lock) { if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) return fuse_read_single_forget(fiq, cs, nbytes); else return fuse_read_batch_forget(fiq, cs, nbytes); } /* * Read a single request into the userspace filesystem's buffer. This * function waits until a request is available, then removes it from * the pending list and copies request data to userspace buffer. If * no reply is needed (FORGET) or request has been aborted or there * was an error during the copying then it's finished by calling * fuse_request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_copy_state *cs, size_t nbytes) { ssize_t err; struct fuse_conn *fc = fud->fc; struct fuse_iqueue *fiq = &fc->iq; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_args *args; unsigned reqsize; unsigned int hash; /* * Require sane minimum read buffer - that has capacity for fixed part * of any request header + negotiated max_write room for data. * * Historically libfuse reserves 4K for fixed header room, but e.g. 
* GlusterFS reserves only 80 bytes * * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` * * which is the absolute minimum any sane filesystem should be using * for header room. */ if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, sizeof(struct fuse_in_header) + sizeof(struct fuse_write_in) + fc->max_write)) return -EINVAL; restart: for (;;) { spin_lock(&fiq->lock); if (!fiq->connected || request_pending(fiq)) break; spin_unlock(&fiq->lock); if (file->f_flags & O_NONBLOCK) return -EAGAIN; err = wait_event_interruptible_exclusive(fiq->waitq, !fiq->connected || request_pending(fiq)); if (err) return err; } if (!fiq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; goto err_unlock; } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, intr_entry); return fuse_read_interrupt(fiq, cs, nbytes, req); } if (forget_pending(fiq)) { if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) return fuse_read_forget(fc, fiq, cs, nbytes); if (fiq->forget_batch <= -8) fiq->forget_batch = 16; } req = list_entry(fiq->pending.next, struct fuse_req, list); clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); args = req->args; reqsize = req->in.h.len; /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; fuse_request_end(req); goto restart; } spin_lock(&fpq->lock); /* * Must not put request on fpq->io queue after having been shut down by * fuse_abort_conn() */ if (!fpq->connected) { req->out.h.error = err = -ECONNABORTED; goto out_end; } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); if (!err) err = fuse_copy_args(cs, args->in_numargs, args->in_pages, (struct fuse_arg *) args->in_args, 0); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { req->out.h.error = -EIO; goto out_end; } if (!test_bit(FR_ISREPLY, &req->flags)) { err = reqsize; goto out_end; } hash = fuse_req_hash(req->in.h.unique); list_move_tail(&req->list, &fpq->processing[hash]); __fuse_get_request(req); set_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(req); fuse_put_request(req); return reqsize; out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); fuse_request_end(req); return err; err_unlock: spin_unlock(&fiq->lock); return err; } static int fuse_dev_open(struct inode *inode, struct file *file) { /* * The fuse device's file's private_data is used to hold * the fuse_conn(ection) when it is mounted, and is used to * keep track of whether the file has been mounted already. 
*/ file->private_data = NULL; return 0; } static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); if (!fud) return -EPERM; if (!user_backed_iter(to)) return -EINVAL; fuse_copy_init(&cs, 1, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { int total, ret; int page_nr = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); if (!fud) return -EPERM; bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; fuse_copy_init(&cs, 1, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } for (ret = total = 0; page_nr < cs.nr_segs; total += ret) { /* * Need to be careful about this. Having buf->ops in module * code can Oops if the buffer persists after module unload. */ bufs[page_nr].ops = &nosteal_pipe_buf_ops; bufs[page_nr].flags = 0; ret = add_to_pipe(pipe, &bufs[page_nr++]); if (unlikely(ret < 0)) break; } if (total) ret = total; out: for (; page_nr < cs.nr_segs; page_nr++) put_page(bufs[page_nr].page); kvfree(bufs); return ret; } static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_poll_wakeup_out outarg; int err = -EINVAL; if (size != sizeof(outarg)) goto err; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto err; fuse_copy_finish(cs); return fuse_notify_poll_wakeup(fc, &outarg); err: fuse_copy_finish(cs); return err; } static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_inode_out outarg; int err = -EINVAL; if (size != sizeof(outarg)) goto err; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto err; fuse_copy_finish(cs); down_read(&fc->killsb); err = fuse_reverse_inval_inode(fc, outarg.ino, outarg.off, outarg.len); up_read(&fc->killsb); return err; err: fuse_copy_finish(cs); return err; } static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; int err = -ENOMEM; char *buf; struct qstr name; buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); if (!buf) goto err; err = -EINVAL; if (size < sizeof(outarg)) goto err; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto err; err = -ENAMETOOLONG; if (outarg.namelen > FUSE_NAME_MAX) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); if (err) goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; err: kfree(buf); fuse_copy_finish(cs); return err; } static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; int err = -ENOMEM; char *buf; struct qstr name; buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); if (!buf) goto err; err = -EINVAL; if (size < sizeof(outarg)) goto err; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto 
err; err = -ENAMETOOLONG; if (outarg.namelen > FUSE_NAME_MAX) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); if (err) goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; err: kfree(buf); fuse_copy_finish(cs); return err; } static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_store_out outarg; struct inode *inode; struct address_space *mapping; u64 nodeid; int err; pgoff_t index; unsigned int offset; unsigned int num; loff_t file_size; loff_t end; err = -EINVAL; if (size < sizeof(outarg)) goto out_finish; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto out_finish; err = -EINVAL; if (size - sizeof(outarg) != outarg.size) goto out_finish; nodeid = outarg.nodeid; down_read(&fc->killsb); err = -ENOENT; inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; mapping = inode->i_mapping; index = outarg.offset >> PAGE_SHIFT; offset = outarg.offset & ~PAGE_MASK; file_size = i_size_read(inode); end = outarg.offset + outarg.size; if (end > file_size) { file_size = end; fuse_write_update_attr(inode, file_size, outarg.size); } num = outarg.size; while (num) { struct page *page; unsigned int this_num; err = -ENOMEM; page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) goto out_iput; this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); if (!err && offset == 0 && (this_num == PAGE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); put_page(page); if (err) goto out_iput; num -= this_num; offset = 0; index++; } err = 0; out_iput: iput(inode); out_up_killsb: up_read(&fc->killsb); out_finish: fuse_copy_finish(cs); return err; } struct fuse_retrieve_args { struct fuse_args_pages ap; struct fuse_notify_retrieve_in inarg; }; static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_retrieve_args *ra = container_of(args, typeof(*ra), ap.args); release_pages(ra->ap.pages, ra->ap.num_pages); kfree(ra); } static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, struct fuse_notify_retrieve_out *outarg) { int err; struct address_space *mapping = inode->i_mapping; pgoff_t index; loff_t file_size; unsigned int num; unsigned int offset; size_t total_len = 0; unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); struct fuse_args_pages *ap; struct fuse_args *args; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); num = min(outarg->size, fc->max_write); if (outarg->offset > file_size) num = 0; else if (outarg->offset + num > file_size) num = file_size - outarg->offset; num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) return -ENOMEM; ap = &ra->ap; ap->pages = (void *) (ra + 1); ap->descs = (void *) (ap->pages + num_pages); args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; args->in_numargs = 2; args->in_pages = true; args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; while (num && ap->num_pages < 
num_pages) { struct page *page; unsigned int this_num; page = find_get_page(mapping, index); if (!page) break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); ap->pages[ap->num_pages] = page; ap->descs[ap->num_pages].offset = offset; ap->descs[ap->num_pages].length = this_num; ap->num_pages++; offset = 0; num -= this_num; total_len += this_num; index++; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; args->in_args[0].size = sizeof(ra->inarg); args->in_args[0].value = &ra->inarg; args->in_args[1].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) fuse_retrieve_end(fm, args, err); return err; } static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_retrieve_out outarg; struct fuse_mount *fm; struct inode *inode; u64 nodeid; int err; err = -EINVAL; if (size != sizeof(outarg)) goto copy_finish; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) goto copy_finish; fuse_copy_finish(cs); down_read(&fc->killsb); err = -ENOENT; nodeid = outarg.nodeid; inode = fuse_ilookup(fc, nodeid, &fm); if (inode) { err = fuse_retrieve(fm, inode, &outarg); iput(inode); } up_read(&fc->killsb); return err; copy_finish: fuse_copy_finish(cs); return err; } /* * Resending all processing queue requests. * * During a FUSE daemon panics and failover, it is possible for some inflight * requests to be lost and never returned. As a result, applications awaiting * replies would become stuck forever. To address this, we can use notification * to trigger resending of these pending requests to the FUSE daemon, ensuring * they are properly processed again. * * Please note that this strategy is applicable only to idempotent requests or * if the FUSE daemon takes careful measures to avoid processing duplicated * non-idempotent requests. 
*/ static void fuse_resend(struct fuse_conn *fc) { struct fuse_dev *fud; struct fuse_req *req, *next; struct fuse_iqueue *fiq = &fc->iq; LIST_HEAD(to_queue); unsigned int i; spin_lock(&fc->lock); if (!fc->connected) { spin_unlock(&fc->lock); return; } list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; spin_lock(&fpq->lock); for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) list_splice_tail_init(&fpq->processing[i], &to_queue); spin_unlock(&fpq->lock); } spin_unlock(&fc->lock); list_for_each_entry_safe(req, next, &to_queue, list) { set_bit(FR_PENDING, &req->flags); clear_bit(FR_SENT, &req->flags); /* mark the request as resend request */ req->in.h.unique |= FUSE_UNIQUE_RESEND; } spin_lock(&fiq->lock); /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); fiq->ops->wake_pending_and_unlock(fiq); } static int fuse_notify_resend(struct fuse_conn *fc) { fuse_resend(fc); return 0; } static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { /* Don't try to move pages (yet) */ cs->move_pages = 0; switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); case FUSE_NOTIFY_INVAL_INODE: return fuse_notify_inval_inode(fc, size, cs); case FUSE_NOTIFY_INVAL_ENTRY: return fuse_notify_inval_entry(fc, size, cs); case FUSE_NOTIFY_STORE: return fuse_notify_store(fc, size, cs); case FUSE_NOTIFY_RETRIEVE: return fuse_notify_retrieve(fc, size, cs); case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); default: fuse_copy_finish(cs); return -EINVAL; } } /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; list_for_each_entry(req, &fpq->processing[hash], list) { if (req->in.h.unique == unique) return req; } return NULL; } static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; else if (reqsize > nbytes) { struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; unsigned diffsize = reqsize - nbytes; if (diffsize > lastarg->size) return -EINVAL; lastarg->size -= diffsize; } return fuse_copy_args(cs, args->out_numargs, args->out_pages, args->out_args, args->page_zeroing); } /* * Write a single reply to a request. First the header is copied from * the write buffer. The request is then searched on the processing * list by the unique ID found in the header. If found, then remove * it from the list and copy the rest of the buffer to the request. * The request is finished by calling fuse_request_end(). */ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) { int err; struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_out_header oh; err = -EINVAL; if (nbytes < sizeof(struct fuse_out_header)) goto out; err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) goto copy_finish; err = -EINVAL; if (oh.len != nbytes) goto copy_finish; /* * Zero oh.unique indicates unsolicited notification message * and error contains notification code. 
*/ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); goto out; } err = -EINVAL; if (oh.error <= -512 || oh.error > 0) goto copy_finish; spin_lock(&fpq->lock); req = NULL; if (fpq->connected) req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { spin_unlock(&fpq->lock); goto copy_finish; } /* Is it an interrupt reply ID? */ if (oh.unique & FUSE_INT_REQ_BIT) { __fuse_get_request(req); spin_unlock(&fpq->lock); err = 0; if (nbytes != sizeof(struct fuse_out_header)) err = -EINVAL; else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) err = queue_interrupt(req); fuse_put_request(req); goto copy_finish; } clear_bit(FR_SENT, &req->flags); list_move(&req->list, &fpq->io); req->out.h = oh; set_bit(FR_LOCKED, &req->flags); spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) cs->move_pages = 0; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) err = -ENOENT; else if (err) req->out.h.error = -EIO; if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); fuse_request_end(req); out: return err ? err : nbytes; copy_finish: fuse_copy_finish(cs); goto out; } static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; if (!user_backed_iter(from)) return -EINVAL; fuse_copy_init(&cs, 0, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { unsigned int head, tail, mask, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; struct fuse_dev *fud; size_t rem; ssize_t ret; fud = fuse_get_dev(out); if (!fud) return -EPERM; pipe_lock(pipe); head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; count = head - tail; bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { pipe_unlock(pipe); return -ENOMEM; } nbuf = 0; rem = 0; for (idx = tail; idx != head && rem < len; idx++) rem += pipe->bufs[idx & mask].len; ret = -EINVAL; if (rem < len) goto out_free; rem = len; while (rem) { struct pipe_buffer *ibuf; struct pipe_buffer *obuf; if (WARN_ON(nbuf >= count || tail == head)) goto out_free; ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; if (rem >= ibuf->len) { *obuf = *ibuf; ibuf->ops = NULL; tail++; pipe->tail = tail; } else { if (!pipe_buf_get(pipe, ibuf)) goto out_free; *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; ibuf->offset += obuf->len; ibuf->len -= obuf->len; } nbuf++; rem -= obuf->len; } pipe_unlock(pipe); fuse_copy_init(&cs, 0, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) cs.move_pages = 1; ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); out_free: for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; if (buf->ops) pipe_buf_release(pipe, buf); } pipe_unlock(pipe); kvfree(bufs); return ret; } static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) { __poll_t mask = EPOLLOUT | EPOLLWRNORM; struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); if (!fud) return EPOLLERR; fiq = &fud->fc->iq; poll_wait(file, &fiq->waitq, wait); spin_lock(&fiq->lock); if (!fiq->connected) mask 
= EPOLLERR; else if (request_pending(fiq)) mask |= EPOLLIN | EPOLLRDNORM; spin_unlock(&fiq->lock); return mask; } /* Abort all requests on the given list (pending or processing) */ static void end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); fuse_request_end(req); } } static void end_polls(struct fuse_conn *fc) { struct rb_node *p; p = rb_first(&fc->polled_files); while (p) { struct fuse_file *ff; ff = rb_entry(p, struct fuse_file, polled_node); wake_up_interruptible_all(&ff->poll_wait); p = rb_next(p); } } /* * Abort all requests. * * Emergency exit in case of a malicious or accidental deadlock, or just a hung * filesystem. * * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be * finished after unlock; see unlock_request(). 2: Finish off the unlocked * requests. It is possible that some request will finish before we can. This * is OK, the request will in that case be removed from the list before we touch * it. */ void fuse_abort_conn(struct fuse_conn *fc) { struct fuse_iqueue *fiq = &fc->iq; spin_lock(&fc->lock); if (fc->connected) { struct fuse_dev *fud; struct fuse_req *req, *next; LIST_HEAD(to_end); unsigned int i; /* Background queuing checks fc->connected under bg_lock */ spin_lock(&fc->bg_lock); fc->connected = 0; spin_unlock(&fc->bg_lock); fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; spin_lock(&fpq->lock); fpq->connected = 0; list_for_each_entry_safe(req, next, &fpq->io, list) { req->out.h.error = -ECONNABORTED; spin_lock(&req->waitq.lock); set_bit(FR_ABORTED, &req->flags); if (!test_bit(FR_LOCKED, &req->flags)) { set_bit(FR_PRIVATE, &req->flags); __fuse_get_request(req); list_move(&req->list, &to_end); } spin_unlock(&req->waitq.lock); } for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) list_splice_tail_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); } spin_lock(&fc->bg_lock); fc->blocked = 0; fc->max_background = UINT_MAX; flush_bg_queue(fc); spin_unlock(&fc->bg_lock); spin_lock(&fiq->lock); fiq->connected = 0; list_for_each_entry(req, &fiq->pending, list) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) kfree(fuse_dequeue_forget(fiq, 1, NULL)); wake_up_all(&fiq->waitq); spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); end_requests(&to_end); } else { spin_unlock(&fc->lock); } } EXPORT_SYMBOL_GPL(fuse_abort_conn); void fuse_wait_aborted(struct fuse_conn *fc) { /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); } int fuse_dev_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; LIST_HEAD(to_end); unsigned int i; spin_lock(&fpq->lock); WARN_ON(!list_empty(&fpq->io)); for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) list_splice_init(&fpq->processing[i], 
&to_end); spin_unlock(&fpq->lock); end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); fuse_abort_conn(fc); } fuse_dev_free(fud); } return 0; } EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); if (!fud) return -EPERM; /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); } static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; if (new->private_data) return -EINVAL; fud = fuse_dev_alloc_install(fc); if (!fud) return -ENOMEM; new->private_data = fud; atomic_inc(&fc->dev_count); return 0; } static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; if (get_user(oldfd, argp)) return -EFAULT; f = fdget(oldfd); if (!f.file) return -EINVAL; /* * Check against file->f_op because CUSE * uses the same ioctl handler. */ if (f.file->f_op == file->f_op) fud = fuse_get_dev(f.file); res = -EINVAL; if (fud) { mutex_lock(&fuse_mutex); res = fuse_device_clone(fud->fc, file); mutex_unlock(&fuse_mutex); } fdput(f); return res; } static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_backing_map __user *argp) { struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; if (!fud) return -EPERM; if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; if (copy_from_user(&map, argp, sizeof(map))) return -EFAULT; return fuse_backing_open(fud->fc, &map); } static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) { struct fuse_dev *fud = fuse_get_dev(file); int backing_id; if (!fud) return -EPERM; if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; if (get_user(backing_id, argp)) return -EFAULT; return fuse_backing_close(fud->fc, backing_id); } static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; switch (cmd) { case FUSE_DEV_IOC_CLONE: return fuse_dev_ioctl_clone(file, argp); case FUSE_DEV_IOC_BACKING_OPEN: return fuse_dev_ioctl_backing_open(file, argp); case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); default: return -ENOTTY; } } const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, .llseek = no_llseek, .read_iter = fuse_dev_read, .splice_read = fuse_dev_splice_read, .write_iter = fuse_dev_write, .splice_write = fuse_dev_splice_write, .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); static struct miscdevice fuse_miscdevice = { .minor = FUSE_MINOR, .name = "fuse", .fops = &fuse_dev_operations, }; int __init fuse_dev_init(void) { int err = -ENOMEM; fuse_req_cachep = kmem_cache_create("fuse_request", sizeof(struct fuse_req), 0, 0, NULL); if (!fuse_req_cachep) goto out; err = misc_register(&fuse_miscdevice); if (err) goto out_cache_clean; return 0; out_cache_clean: kmem_cache_destroy(fuse_req_cachep); out: return err; } void fuse_dev_cleanup(void) { misc_deregister(&fuse_miscdevice); kmem_cache_destroy(fuse_req_cachep); }
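A brief illustration of the notification path implemented above: fuse_dev_do_write() treats a fuse_out_header with unique == 0 as an unsolicited notification, with the notify code carried in the error field, and fuse_notify_inval_entry() then insists that the payload be exactly sizeof(outarg) + namelen + 1 bytes (the name plus its terminating NUL). The following is a minimal userspace sketch of such a write to /dev/fuse, not taken from these sources; the descriptor, parent nodeid and name are placeholders, and a real daemon would normally go through libfuse's fuse_lowlevel_notify_inval_entry() instead.

/*
 * Hedged sketch: emit FUSE_NOTIFY_INVAL_ENTRY from a userspace daemon.
 * devfd is an already-open /dev/fuse descriptor (illustrative).
 */
#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

static int notify_inval_entry(int devfd, uint64_t parent, const char *name)
{
	size_t namelen = strlen(name);
	struct fuse_notify_inval_entry_out outarg = {
		.parent  = parent,
		.namelen = namelen,                  /* third field left zeroed */
	};
	struct fuse_out_header oh = {
		.unique = 0,                         /* 0 => unsolicited notification */
		.error  = FUSE_NOTIFY_INVAL_ENTRY,   /* notify code travels in .error */
		.len    = sizeof(oh) + sizeof(outarg) + namelen + 1,
	};
	struct iovec iov[3] = {
		{ &oh, sizeof(oh) },
		{ &outarg, sizeof(outarg) },
		{ (void *)name, namelen + 1 },       /* name plus terminating NUL */
	};

	/* The kernel rejects the write unless oh.len matches the byte count. */
	return writev(devfd, iov, 3) == (ssize_t)oh.len ? 0 : -1;
}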
// SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2007 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * */ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ struct migrate_struct { ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, struct migrate_struct *lb) { int retval = 0, needed; struct ext4_extent newext; struct ext4_ext_path *path; if (lb->first_pblock == 0) return 0; /* Add the extent to temp inode*/ newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = 
ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); path = NULL; goto err_out; } /* * Calculate the credit needed to inserting this extent * Since we are doing this in loop we may accumulate extra * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } static int update_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* * See if we can add on to the existing range (if it exists) */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; lb->last_block = lb->curr_block; lb->curr_block++; return 0; } /* * Start a new range. */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; lb->first_block = lb->last_block = lb->curr_block; lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { lb->curr_block++; } } put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries; } } put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries * max_entries; } } put_bh(bh); return retval; } static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; int err; bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { err = 
ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) { put_bh(bh); return err; } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_tind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i, retval = 0; __le32 *tmp_idata; struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { retval = free_dind_blocks(handle, inode, tmp_idata[i]); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) { int retval; /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ if (i_data[1]) { retval = free_dind_blocks(handle, inode, i_data[1]); if (retval) return retval; } /* ei->i_data[EXT4_TIND_BLOCK] */ if (i_data[2]) { retval = free_tind_blocks(handle, inode, i_data[2]); if (retval) return retval; } return 0; } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); /* * One credit accounted for writing the * i_data field of the original inode */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; down_write(&EXT4_I(inode)->i_data_sem); /* * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* * Update i_blocks with the new blocks that got * allocated while adding extents for extent index * blocks. * * While converting to extents we need not * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. 
*/ spin_lock(&inode->i_lock); inode->i_blocks += tmp_inode->i_blocks; spin_unlock(&inode->i_lock); up_write(&EXT4_I(inode)->i_data_sem); /* * We mark the inode dirty after, because we decrement the * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); retval2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(retval2 && !retval)) retval = retval2; err_out: return retval; } static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_idx *ix) { int i, retval = 0; ext4_fsblk_t block; struct buffer_head *bh; struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); bh = ext4_sb_bread(inode->i_sb, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } /* * Free the extent meta data blocks only */ static int free_ext_block(handle_t *handle, struct inode *inode) { int i, retval = 0; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; struct ext4_extent_idx *ix; if (eh->eh_depth == 0) /* * No extra blocks allocated for extent meta data */ return 0; ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) return retval; } return retval; } int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; int alloc_ctx; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_unlock; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); goto out_unlock; } /* * Use the correct seed for checksum (i.e. the seed from 'inode'). This * is so that the metadata blocks will have the correct checksum after * the migration. 
*/ ei = EXT4_I(inode); tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have committed the * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated * with i_data_sem held to prevent racing with block * allocation. */ down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_tmp_inode; } i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; } else lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto out_stop; /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. 
*/ tmp_inode->i_blocks = 0; EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); out_stop: ext4_journal_stop(handle); out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } /* * Migrate a simple extent-based inode to use the i_blocks[] array */ int ext4_ind_migrate(struct inode *inode) { struct ext4_extent_header *eh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* * In order to get correct extent info, force all delayed allocation * blocks to be allocated, otherwise delayed allocation blocks may not * be reflected and bypass the checks on extent header. */ if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_unlock; } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); if (ret) goto errout; eh = ext_inode_hdr(inode); ex = EXT_FIRST_EXTENT(eh); if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { ret = -EOPNOTSUPP; goto errout; } if (eh->eh_entries == 0) blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); start = le32_to_cpu(ex->ee_block); end = start + len - 1; if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } } ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2 && !ret)) ret = ret2; errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; }
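For context on how the conversion code above is reached: ext4_ext_migrate() is driven by the EXT4_IOC_MIGRATE ioctl on an open descriptor of the indirect-mapped inode. A minimal sketch follows; the file path is illustrative, and the ioctl number is defined locally to mirror fs/ext4/ext4.h in case no uapi header exports it on the build host.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#ifndef EXT4_IOC_MIGRATE
#define EXT4_IOC_MIGRATE _IO('f', 9)   /* assumed to match fs/ext4/ext4.h */
#endif

int main(void)
{
	/* Illustrative path to an indirect-mapped file on an ext4 mount. */
	int fd = open("/mnt/ext4/legacy-file", O_RDONLY);

	if (fd < 0)
		return 1;
	/* Fails with EINVAL if the inode already uses extents or the
	 * filesystem lacks the extents feature, mirroring the checks in
	 * ext4_ext_migrate(). */
	if (ioctl(fd, EXT4_IOC_MIGRATE) < 0)
		perror("EXT4_IOC_MIGRATE");
	close(fd);
	return 0;
}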
// SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_etf.c Earliest TxTime First queueing discipline. * * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> * Vinicius Costa Gomes <vinicius.gomes@intel.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/rbtree.h> #include <linux/skbuff.h> #include <linux/posix-timers.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/sock.h> #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) #define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, }; static inline int validate_input_params(struct tc_etf_qopt *qopt, struct netlink_ext_ack *extack) { /* Check if params comply to the following rules: * * Clockid and delta must be valid. * * * Dynamic clockids are not supported. * * * Delta must be a positive integer. * * Also note that for the HW offload case, we must * expect that system clocks have been synchronized to PHC. 
*/ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); return -ENOTSUPP; } if (qopt->clockid != CLOCK_TAI) { NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); return -EINVAL; } if (qopt->delta < 0) { NL_SET_ERR_MSG(extack, "Delta must be positive"); return -EINVAL; } return 0; } static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) { struct etf_sched_data *q = qdisc_priv(sch); ktime_t txtime = nskb->tstamp; struct sock *sk = nskb->sk; ktime_t now; if (q->skip_sock_check) goto skip; if (!sk || !sk_fullsock(sk)) return false; if (!sock_flag(sk, SOCK_TXTIME)) return false; /* We don't perform crosstimestamping. * Drop if packet's clockid differs from qdisc's. */ if (sk->sk_clockid != q->clockid) return false; if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; return true; } static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; p = rb_first_cached(&q->head); if (!p) return NULL; return rb_to_skb(p); } static void reset_watchdog(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; if (!skb) { qdisc_watchdog_cancel(&q->watchdog); return; } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) { struct sock_exterr_skb *serr; struct sk_buff *clone; ktime_t txtime = skb->tstamp; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors)) return; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) return; serr = SKB_EXT_ERR(clone); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; serr->ee.ee_type = 0; serr->ee.ee_code = code; serr->ee.ee_pad = 0; serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ serr->ee.ee_info = txtime; /* low part of tstamp */ if (sock_queue_err_skb(sk, clone)) kfree_skb(clone); } static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); } while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { p = &parent->rb_left; } } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return NET_XMIT_SUCCESS; } static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, ktime_t now) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *to_free = NULL; struct sk_buff *tmp = NULL; skb_rbtree_walk_from_safe(skb, tmp) { if (ktime_after(skb->tstamp, now)) break; rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. 
*/ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, &to_free); qdisc_qstats_overlimit(sch); sch->q.qlen--; } kfree_skb_list(to_free); } static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. */ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->last = skb->tstamp; sch->q.qlen--; } static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; ktime_t now, next; skb = etf_peek_timesortedlist(sch); if (!skb) return NULL; now = q->get_time(); /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } /* When in deadline mode, dequeue as soon as possible and change the * txtime from deadline to (now + delta). */ if (q->deadline_mode) { timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } next = ktime_sub_ns(skb->tstamp, q->delta); /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) timesortedlist_remove(sch, skb); else skb = NULL; out: /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return skb; } static void etf_disable_offload(struct net_device *dev, struct etf_sched_data *q) { struct tc_etf_qopt_offload etf = { }; const struct net_device_ops *ops; int err; if (!q->offload) return; ops = dev->netdev_ops; if (!ops->ndo_setup_tc) return; etf.queue = q->queue; etf.enable = 0; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) pr_warn("Couldn't disable ETF offload for queue %d\n", etf.queue); } static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_etf_qopt_offload etf = { }; int err; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); return -EOPNOTSUPP; } etf.queue = q->queue; etf.enable = 1; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) { NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); return err; } return 0; } static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_ETF_MAX + 1]; struct tc_etf_qopt *qopt; int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing ETF qdisc options which are mandatory"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy, extack); if (err < 0) return err; if (!tb[TCA_ETF_PARMS]) { NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); return -EINVAL; } qopt = nla_data(tb[TCA_ETF_PARMS]); pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? 
"on" : "off"); err = validate_input_params(qopt, extack); if (err < 0) return err; q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); if (OFFLOAD_IS_ON(qopt)) { err = etf_enable_offload(dev, q, extack); if (err < 0) return err; } /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: q->get_time = ktime_get_real; break; case CLOCK_MONOTONIC: q->get_time = ktime_get; break; case CLOCK_BOOTTIME: q->get_time = ktime_get_boottime; break; case CLOCK_TAI: q->get_time = ktime_get_clocktai; break; default: NL_SET_ERR_MSG(extack, "Clockid is not supported"); return -ENOTSUPP; } qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); return 0; } static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } } static void etf_reset(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); /* No matter which mode we are on, it's safe to clear both lists. */ timesortedlist_clear(sch); __qdisc_reset_queue(&sch->q); q->last = 0; } static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); struct tc_etf_qopt opt = { }; struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; opt.delta = READ_ONCE(q->delta); opt.clockid = READ_ONCE(q->clockid); if (READ_ONCE(q->offload)) opt.flags |= TC_ETF_OFFLOAD_ON; if (READ_ONCE(q->deadline_mode)) opt.flags |= TC_ETF_DEADLINE_MODE_ON; if (READ_ONCE(q->skip_sock_check)) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .id = "etf", .priv_size = sizeof(struct etf_sched_data), .enqueue = etf_enqueue_timesortedlist, .dequeue = etf_dequeue_timesortedlist, .peek = etf_peek_timesortedlist, .init = etf_init, .reset = etf_reset, .destroy = etf_destroy, .dump = etf_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("etf"); static int __init etf_module_init(void) { return register_qdisc(&etf_qdisc_ops); } static void __exit etf_module_exit(void) { unregister_qdisc(&etf_qdisc_ops); } module_init(etf_module_init) module_exit(etf_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc");
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for the RDMA Connect Manager. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rdma_cma #if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RDMA_CMA_H #include <linux/tracepoint.h> #include <trace/misc/rdma.h> DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos ) ); #define DEFINE_CMA_FSM_EVENT(name) \ DEFINE_EVENT(cma_fsm_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); DEFINE_CMA_FSM_EVENT(send_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); TRACE_EVENT(cm_id_attach, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_device *device ), TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", __entry->cm_id, 
__entry->srcaddr, __entry->dstaddr, __get_str(devname) ) ); DECLARE_EVENT_CLASS(cma_qp_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(u32, qp_num) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->qp_num = id_priv->qp_num; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->qp_num ) ); #define DEFINE_CMA_QP_EVENT(name) \ DEFINE_EVENT(cma_qp_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_QP_EVENT(send_req); DEFINE_CMA_QP_EVENT(send_rep); DEFINE_CMA_QP_EVENT(qp_destroy); /* * enum ib_wp_type, from include/rdma/ib_verbs.h */ #define IB_QP_TYPE_LIST \ ib_qp_type(SMI) \ ib_qp_type(GSI) \ ib_qp_type(RC) \ ib_qp_type(UC) \ ib_qp_type(UD) \ ib_qp_type(RAW_IPV6) \ ib_qp_type(RAW_ETHERTYPE) \ ib_qp_type(RAW_PACKET) \ ib_qp_type(XRC_INI) \ ib_qp_type_end(XRC_TGT) #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); #define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); IB_QP_TYPE_LIST #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) { IB_QPT_##x, #x }, #define ib_qp_type_end(x) { IB_QPT_##x, #x } #define rdma_show_qp_type(x) \ __print_symbolic(x, IB_QP_TYPE_LIST) TRACE_EVENT(cm_qp_create, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_pd *pd, const struct ib_qp_init_attr *qp_init_attr, int rc ), TP_ARGS(id_priv, pd, qp_init_attr, rc), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, pd_id) __field(u32, tos) __field(u32, qp_num) __field(u32, send_wr) __field(u32, recv_wr) __field(int, rc) __field(unsigned long, qp_type) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->pd_id = pd->res.id; __entry->tos = id_priv->tos; __entry->send_wr = qp_init_attr->cap.max_send_wr; __entry->recv_wr = qp_init_attr->cap.max_recv_wr; __entry->rc = rc; if (!rc) { __entry->qp_num = id_priv->qp_num; __entry->qp_type = id_priv->id.qp_type; } else { __entry->qp_num = 0; __entry->qp_type = 0; } memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" " send_wr=%u recv_wr=%u qp_num=%u rc=%d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->pd_id, rdma_show_qp_type(__entry->qp_type), __entry->send_wr, __entry->recv_wr, __entry->qp_num, __entry->rc ) ); TRACE_EVENT(cm_req_handler, TP_PROTO( const struct rdma_id_private *id_priv, int event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, 
sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_ib_cm_event(__entry->event), __entry->event ) ); TRACE_EVENT(cm_event_handler, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, status) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->status = event->status; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->event, __entry->status ) ); TRACE_EVENT(cm_event_done, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event, int result ), TP_ARGS(id_priv, event, result), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, result) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->result = result; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->result ) ); DECLARE_EVENT_CLASS(cma_client_class, TP_PROTO( const struct ib_device *device ), TP_ARGS(device), TP_STRUCT__entry( __string(name, device->name) ), TP_fast_assign( __assign_str(name); ), TP_printk("device name=%s", __get_str(name) ) ); #define DEFINE_CMA_CLIENT_EVENT(name) \ DEFINE_EVENT(cma_client_class, cm_##name, \ TP_PROTO( \ const struct ib_device *device \ ), \ TP_ARGS(device)) DEFINE_CMA_CLIENT_EVENT(add_one); DEFINE_CMA_CLIENT_EVENT(remove_one); #endif /* _TRACE_RDMA_CMA_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE cma_trace #include <trace/define_trace.h>
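The header above only declares the rdma_cma tracepoints (cm_send_req, cm_event_handler, cm_id_attach and friends); observing them is a tracefs exercise. A small sketch follows, assuming tracefs is mounted at /sys/kernel/tracing; the shell equivalent is "echo 1 > events/rdma_cma/enable" followed by "cat trace_pipe".

#include <stdio.h>

int main(void)
{
	FILE *en = fopen("/sys/kernel/tracing/events/rdma_cma/enable", "w");
	FILE *tp;
	int c;

	if (!en)
		return 1;
	fputs("1", en);                 /* enable every event in the rdma_cma group */
	if (fclose(en) != 0)
		return 1;

	/* Blocks and streams formatted events (cm.id=..., src=..., dst=...). */
	tp = fopen("/sys/kernel/tracing/trace_pipe", "r");
	if (!tp)
		return 1;
	while ((c = fgetc(tp)) != EOF)
		putchar(c);
	fclose(tp);
	return 0;
}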
4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0 /* * Support for async notification of waitid */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "cancel.h" #include "waitid.h" #include "../kernel/exit.h" static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) struct io_waitid { struct file *file; int which; pid_t upid; int options; atomic_t refs; struct wait_queue_head *head; struct siginfo __user *infop; struct waitid_info info; }; static void io_waitid_free(struct io_kiocb *req) { struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); kfree(req->async_data); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } #ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; bool ret; infop = (struct compat_siginfo __user *) iw->infop; if (!user_write_access_begin(infop, sizeof(*infop))) return false; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &infop->si_code, Efault); unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); unsafe_put_user(iw->info.status, &infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } #endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); bool ret; if (!iw->infop) return true; #ifdef CONFIG_COMPAT if (req->ctx->compat) return io_waitid_compat_copy_si(iw, signo); #endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; unsafe_put_user(signo, &iw->infop->si_signo, Efault); unsafe_put_user(0, &iw->infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &iw->infop->si_uid, 
Efault); unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static int io_waitid_finish(struct io_kiocb *req, int ret) { int signo = 0; if (ret > 0) { signo = SIGCHLD; ret = 0; } if (!io_waitid_copy_si(req, signo)) ret = -EFAULT; io_waitid_free(req); return ret; } static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; /* * Mark us canceled regardless of ownership. This will prevent a * potential retry from a spurious wakeup. */ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); /* claim ownership */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; spin_lock_irq(&iw->head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { struct hlist_node *tmp; struct io_kiocb *req; int nr = 0; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) return -ENOENT; io_ring_submit_lock(ctx, issue_flags); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; if (__io_waitid_cancel(ctx, req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; } io_ring_submit_unlock(ctx, issue_flags); if (nr) return nr; return -ENOENT; } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool found = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); found = true; } return found; } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; if (!atomic_sub_return(1, &iw->refs)) return false; /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); remove_wait_queue(iw->head, &iwa->wo.child_wait); return true; } static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; io_tw_lock(ctx, ts); ret = __do_wait(&iwa->wo); /* * If we get -ERESTARTSYS here, we need to re-arm and check again * to ensure we get another callback. If the retry works, then we can * just remove ourselves from the waitqueue again and finish the * request. 
*/ if (unlikely(ret == -ERESTARTSYS)) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* Don't retry if cancel found it meanwhile */ ret = -ECANCELED; if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* retry armed, drop our ref */ io_waitid_drop_issue_ref(req); return; } remove_wait_queue(iw->head, &iwa->wo.child_wait); } } io_waitid_complete(req, ret); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); struct io_kiocb *req = iwa->req; struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct task_struct *p = key; if (!pid_child_should_wake(wo, p)) return 0; /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); list_del_init(&wait->entry); return 1; } int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_ring_ctx *ctx = req->ctx; struct io_waitid_async *iwa; int ret; if (io_alloc_async_data(req)) return -ENOMEM; iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) goto done; /* * Mark the request as busy upfront, in case we're racing with the * wakeup. If we are, then we'll notice when we drop this initial * reference again after arming. */ atomic_set(&iw->refs, 1); /* * Cancel must hold the ctx lock, so there's no risk of cancelation * finding us until a) we remain on the list, and b) the lock is * dropped. We only need to worry about racing with the wakeup * callback. */ io_ring_submit_lock(ctx, issue_flags); hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->task; iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* * Nobody else grabbed a reference, it'll complete when we get * a waitqueue callback, or if someone cancels it. */ if (!io_waitid_drop_issue_ref(req)) { io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } hlist_del_init(&req->hash_node); remove_wait_queue(iw->head, &iwa->wo.child_wait); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
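The IO_WAITID_CANCEL_FLAG / IO_WAITID_REF_MASK layout above packs a cancellation bit and a reference count into a single atomic word, and completion is owned by whichever path increments the count and observes the masked previous value as zero. The standalone C11 sketch below models that claim-by-increment pattern with invented names; it is an illustration of the idea, not kernel code.

/* Hedged userspace model of the ownership scheme: the low 31 bits of the
 * word count references, bit 31 marks cancellation.  try_claim() succeeds
 * only for the first caller, mirroring the atomic_fetch_inc() checks in
 * __io_waitid_cancel() and io_waitid_wait(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CANCEL_FLAG (1u << 31)
#define REF_MASK    (CANCEL_FLAG - 1)

static bool try_claim(atomic_uint *refs)
{
	/* masked previous value == 0 means nobody else holds a reference */
	return !(atomic_fetch_add(refs, 1) & REF_MASK);
}

int main(void)
{
	atomic_uint refs;

	atomic_init(&refs, 0);                           /* armed, unowned */
	printf("first claim:  %d\n", try_claim(&refs));  /* 1: we own completion */
	printf("second claim: %d\n", try_claim(&refs));  /* 0: already owned */

	atomic_fetch_or(&refs, CANCEL_FLAG);             /* mark cancelled */
	printf("cancelled:    %d\n", !!(atomic_load(&refs) & CANCEL_FLAG));
	return 0;
}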

1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 
1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? 
: RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. 
*/ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -EINVAL; e_rpf: return -EXDEV; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -EINVAL; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. 
*/ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return -ENODEV; *colon = ':'; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } rcu_read_unlock(); if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. 
* These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_unlock(); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if 
(cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); return -EINVAL; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto errout; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto errout; } err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto errout; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for 
FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? 
RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? 
RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. 
*/ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_tos = frn->fl_tos, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. 
*/ fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. 
*/ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: rtnl_lock(); ip_fib_net_exit(net); rtnl_unlock(); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip_fib_net_exit(net); rtnl_unlock(); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); }
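ip_rt_ioctl() above is reached from userspace through the legacy SIOCADDRT/SIOCDELRT socket ioctls, with rtentry_to_fib_config() translating a struct rtentry into a struct fib_config before fib_table_insert()/fib_table_delete(). A minimal userspace sketch of that path follows; the route values are placeholders, the program needs CAP_NET_ADMIN, and it is an illustration of the legacy interface rather than a recommendation (netlink RTM_NEWROUTE is the modern route).

/* Hedged sketch: add 10.10.10.0/24 via 192.168.1.1 with the legacy ioctl. */
#include <arpa/inet.h>
#include <net/route.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static void set_addr(struct sockaddr *sa, const char *ip)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)sa;

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, ip, &sin->sin_addr);
}

int main(void)
{
	struct rtentry rt;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&rt, 0, sizeof(rt));
	set_addr(&rt.rt_dst, "10.10.10.0");        /* destination network */
	set_addr(&rt.rt_genmask, "255.255.255.0"); /* contiguous mask, host bits clear */
	set_addr(&rt.rt_gateway, "192.168.1.1");   /* next hop */
	rt.rt_flags = RTF_UP | RTF_GATEWAY;

	/* lands in ip_rt_ioctl() -> rtentry_to_fib_config() -> fib_table_insert() */
	if (ioctl(fd, SIOCADDRT, &rt) < 0)
		perror("SIOCADDRT");

	close(fd);
	return 0;
}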
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 // SPDX-License-Identifier: GPL-2.0 /* USB Driver layer for GSM modems Copyright (C) 2005 Matthias Urlichs <smurf@smurf.noris.de> Portions copied from the Keyspan driver by Hugh Blemings <hugh@blemings.org> History: see the git log. Work sponsored by: Sigos GmbH, Germany <info@sigos.de> This driver exists because the "normal" serial driver doesn't work too well with GSM modems. 
Issues: - data loss -- one single Receive URB is not nearly enough - controlling the baud rate doesn't make sense */ #define DRIVER_AUTHOR "Matthias Urlichs <smurf@smurf.noris.de>" #define DRIVER_DESC "USB Driver for GSM modems" #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/serial.h> #include <linux/serial.h> #include "usb-wwan.h" /* * Generate DTR/RTS signals on the port using the SET_CONTROL_LINE_STATE request * in CDC ACM. */ static int usb_wwan_send_setup(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct usb_wwan_port_private *portdata; int val = 0; int ifnum; int res; portdata = usb_get_serial_port_data(port); if (portdata->dtr_state) val |= USB_CDC_CTRL_DTR; if (portdata->rts_state) val |= USB_CDC_CTRL_RTS; ifnum = serial->interface->cur_altsetting->desc.bInterfaceNumber; res = usb_autopm_get_interface(serial->interface); if (res) return res; res = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), USB_CDC_REQ_SET_CONTROL_LINE_STATE, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, val, ifnum, NULL, 0, USB_CTRL_SET_TIMEOUT); usb_autopm_put_interface(port->serial->interface); return res; } void usb_wwan_dtr_rts(struct usb_serial_port *port, int on) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; intfdata = usb_get_serial_data(port->serial); if (!intfdata->use_send_setup) return; portdata = usb_get_serial_port_data(port); /* FIXME: locking */ portdata->rts_state = on; portdata->dtr_state = on; usb_wwan_send_setup(port); } EXPORT_SYMBOL(usb_wwan_dtr_rts); int usb_wwan_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; unsigned int value; struct usb_wwan_port_private *portdata; portdata = usb_get_serial_port_data(port); value = ((portdata->rts_state) ? TIOCM_RTS : 0) | ((portdata->dtr_state) ? TIOCM_DTR : 0) | ((portdata->cts_state) ? TIOCM_CTS : 0) | ((portdata->dsr_state) ? TIOCM_DSR : 0) | ((portdata->dcd_state) ? TIOCM_CAR : 0) | ((portdata->ri_state) ? TIOCM_RNG : 0); return value; } EXPORT_SYMBOL(usb_wwan_tiocmget); int usb_wwan_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(port->serial); if (!intfdata->use_send_setup) return -EINVAL; /* FIXME: what locks portdata fields ? 
*/ if (set & TIOCM_RTS) portdata->rts_state = 1; if (set & TIOCM_DTR) portdata->dtr_state = 1; if (clear & TIOCM_RTS) portdata->rts_state = 0; if (clear & TIOCM_DTR) portdata->dtr_state = 0; return usb_wwan_send_setup(port); } EXPORT_SYMBOL(usb_wwan_tiocmset); int usb_wwan_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; int i; int left, todo; struct urb *this_urb = NULL; /* spurious */ int err; unsigned long flags; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(port->serial); dev_dbg(&port->dev, "%s: write (%d chars)\n", __func__, count); left = count; for (i = 0; left > 0 && i < N_OUT_URB; i++) { todo = left; if (todo > OUT_BUFLEN) todo = OUT_BUFLEN; this_urb = portdata->out_urbs[i]; if (test_and_set_bit(i, &portdata->out_busy)) { if (time_before(jiffies, portdata->tx_start_time[i] + 10 * HZ)) continue; usb_unlink_urb(this_urb); continue; } dev_dbg(&port->dev, "%s: endpoint %d buf %d\n", __func__, usb_pipeendpoint(this_urb->pipe), i); err = usb_autopm_get_interface_async(port->serial->interface); if (err < 0) { clear_bit(i, &portdata->out_busy); break; } /* send the data */ memcpy(this_urb->transfer_buffer, buf, todo); this_urb->transfer_buffer_length = todo; spin_lock_irqsave(&intfdata->susp_lock, flags); if (intfdata->suspended) { usb_anchor_urb(this_urb, &portdata->delayed); spin_unlock_irqrestore(&intfdata->susp_lock, flags); } else { intfdata->in_flight++; spin_unlock_irqrestore(&intfdata->susp_lock, flags); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit urb %d failed: %d\n", __func__, i, err); clear_bit(i, &portdata->out_busy); spin_lock_irqsave(&intfdata->susp_lock, flags); intfdata->in_flight--; spin_unlock_irqrestore(&intfdata->susp_lock, flags); usb_autopm_put_interface_async(port->serial->interface); break; } } portdata->tx_start_time[i] = jiffies; buf += todo; left -= todo; } count -= left; dev_dbg(&port->dev, "%s: wrote (did %d)\n", __func__, count); return count; } EXPORT_SYMBOL(usb_wwan_write); static void usb_wwan_indat_callback(struct urb *urb) { int err; int endpoint; struct usb_serial_port *port; struct device *dev; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); port = urb->context; dev = &port->dev; if (status) { dev_dbg(dev, "%s: nonzero status: %d on endpoint %02x.\n", __func__, status, endpoint); /* don't resubmit on fatal errors */ if (status == -ESHUTDOWN || status == -ENOENT) return; } else { if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } else dev_dbg(dev, "%s: empty read urb received\n", __func__); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { if (err != -EPERM && err != -ENODEV) { dev_err(dev, "%s: resubmit read urb failed. 
(%d)\n", __func__, err); /* busy also in error unless we are killed */ usb_mark_last_busy(port->serial->dev); } } else { usb_mark_last_busy(port->serial->dev); } } static void usb_wwan_outdat_callback(struct urb *urb) { struct usb_serial_port *port; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; unsigned long flags; int i; port = urb->context; intfdata = usb_get_serial_data(port->serial); usb_serial_port_softint(port); usb_autopm_put_interface_async(port->serial->interface); portdata = usb_get_serial_port_data(port); spin_lock_irqsave(&intfdata->susp_lock, flags); intfdata->in_flight--; spin_unlock_irqrestore(&intfdata->susp_lock, flags); for (i = 0; i < N_OUT_URB; ++i) { if (portdata->out_urbs[i] == urb) { smp_mb__before_atomic(); clear_bit(i, &portdata->out_busy); break; } } } unsigned int usb_wwan_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; int i; unsigned int data_len = 0; struct urb *this_urb; portdata = usb_get_serial_port_data(port); for (i = 0; i < N_OUT_URB; i++) { this_urb = portdata->out_urbs[i]; if (this_urb && !test_bit(i, &portdata->out_busy)) data_len += OUT_BUFLEN; } dev_dbg(&port->dev, "%s: %u\n", __func__, data_len); return data_len; } EXPORT_SYMBOL(usb_wwan_write_room); unsigned int usb_wwan_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_wwan_port_private *portdata; int i; unsigned int data_len = 0; struct urb *this_urb; portdata = usb_get_serial_port_data(port); for (i = 0; i < N_OUT_URB; i++) { this_urb = portdata->out_urbs[i]; /* FIXME: This locking is insufficient as this_urb may go unused during the test */ if (this_urb && test_bit(i, &portdata->out_busy)) data_len += this_urb->transfer_buffer_length; } dev_dbg(&port->dev, "%s: %u\n", __func__, data_len); return data_len; } EXPORT_SYMBOL(usb_wwan_chars_in_buffer); int usb_wwan_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata; struct usb_serial *serial = port->serial; int i, err; struct urb *urb; portdata = usb_get_serial_port_data(port); intfdata = usb_get_serial_data(serial); if (port->interrupt_in_urb) { err = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (err) { dev_err(&port->dev, "%s: submit int urb failed: %d\n", __func__, err); } } /* Start reading from the IN endpoint */ for (i = 0; i < N_IN_URB; i++) { urb = portdata->in_urbs[i]; if (!urb) continue; err = usb_submit_urb(urb, GFP_KERNEL); if (err) { dev_err(&port->dev, "%s: submit read urb %d failed: %d\n", __func__, i, err); } } spin_lock_irq(&intfdata->susp_lock); if (++intfdata->open_ports == 1) serial->interface->needs_remote_wakeup = 1; spin_unlock_irq(&intfdata->susp_lock); /* this balances a get in the generic USB serial code */ usb_autopm_put_interface(serial->interface); return 0; } EXPORT_SYMBOL(usb_wwan_open); static void unbusy_queued_urb(struct urb *urb, struct usb_wwan_port_private *portdata) { int i; for (i = 0; i < N_OUT_URB; i++) { if (urb == portdata->out_urbs[i]) { clear_bit(i, &portdata->out_busy); break; } } } void usb_wwan_close(struct usb_serial_port *port) { int i; struct usb_serial *serial = port->serial; struct usb_wwan_port_private *portdata; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; portdata = usb_get_serial_port_data(port); /* * Need to take susp_lock to make sure port is not already being * resumed, but no need to hold 
it due to the tty-port initialized * flag. */ spin_lock_irq(&intfdata->susp_lock); if (--intfdata->open_ports == 0) serial->interface->needs_remote_wakeup = 0; spin_unlock_irq(&intfdata->susp_lock); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; unbusy_queued_urb(urb, portdata); usb_autopm_put_interface_async(serial->interface); } for (i = 0; i < N_IN_URB; i++) usb_kill_urb(portdata->in_urbs[i]); for (i = 0; i < N_OUT_URB; i++) usb_kill_urb(portdata->out_urbs[i]); usb_kill_urb(port->interrupt_in_urb); usb_autopm_get_interface_no_resume(serial->interface); } EXPORT_SYMBOL(usb_wwan_close); static struct urb *usb_wwan_setup_urb(struct usb_serial_port *port, int endpoint, int dir, void *ctx, char *buf, int len, void (*callback) (struct urb *)) { struct usb_serial *serial = port->serial; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; urb = usb_alloc_urb(0, GFP_KERNEL); /* No ISO */ if (!urb) return NULL; usb_fill_bulk_urb(urb, serial->dev, usb_sndbulkpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx); if (intfdata->use_zlp && dir == USB_DIR_OUT) urb->transfer_flags |= URB_ZERO_PACKET; return urb; } int usb_wwan_port_probe(struct usb_serial_port *port) { struct usb_wwan_port_private *portdata; struct urb *urb; u8 *buffer; int i; if (!port->bulk_in_size || !port->bulk_out_size) return -ENODEV; portdata = kzalloc(sizeof(*portdata), GFP_KERNEL); if (!portdata) return -ENOMEM; init_usb_anchor(&portdata->delayed); for (i = 0; i < N_IN_URB; i++) { buffer = (u8 *)__get_free_page(GFP_KERNEL); if (!buffer) goto bail_out_error; portdata->in_buffer[i] = buffer; urb = usb_wwan_setup_urb(port, port->bulk_in_endpointAddress, USB_DIR_IN, port, buffer, IN_BUFLEN, usb_wwan_indat_callback); portdata->in_urbs[i] = urb; } for (i = 0; i < N_OUT_URB; i++) { buffer = kmalloc(OUT_BUFLEN, GFP_KERNEL); if (!buffer) goto bail_out_error2; portdata->out_buffer[i] = buffer; urb = usb_wwan_setup_urb(port, port->bulk_out_endpointAddress, USB_DIR_OUT, port, buffer, OUT_BUFLEN, usb_wwan_outdat_callback); portdata->out_urbs[i] = urb; } usb_set_serial_port_data(port, portdata); return 0; bail_out_error2: for (i = 0; i < N_OUT_URB; i++) { usb_free_urb(portdata->out_urbs[i]); kfree(portdata->out_buffer[i]); } bail_out_error: for (i = 0; i < N_IN_URB; i++) { usb_free_urb(portdata->in_urbs[i]); free_page((unsigned long)portdata->in_buffer[i]); } kfree(portdata); return -ENOMEM; } EXPORT_SYMBOL_GPL(usb_wwan_port_probe); void usb_wwan_port_remove(struct usb_serial_port *port) { int i; struct usb_wwan_port_private *portdata; portdata = usb_get_serial_port_data(port); usb_set_serial_port_data(port, NULL); for (i = 0; i < N_IN_URB; i++) { usb_free_urb(portdata->in_urbs[i]); free_page((unsigned long)portdata->in_buffer[i]); } for (i = 0; i < N_OUT_URB; i++) { usb_free_urb(portdata->out_urbs[i]); kfree(portdata->out_buffer[i]); } kfree(portdata); } EXPORT_SYMBOL(usb_wwan_port_remove); #ifdef CONFIG_PM static void stop_urbs(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; struct usb_wwan_port_private *portdata; for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; portdata = usb_get_serial_port_data(port); if (!portdata) continue; for (j = 0; j < N_IN_URB; j++) usb_kill_urb(portdata->in_urbs[j]); for (j = 0; j < N_OUT_URB; j++) usb_kill_urb(portdata->out_urbs[j]); usb_kill_urb(port->interrupt_in_urb); } } int usb_wwan_suspend(struct usb_serial *serial, pm_message_t message) { struct usb_wwan_intf_private *intfdata = 
usb_get_serial_data(serial); spin_lock_irq(&intfdata->susp_lock); if (PMSG_IS_AUTO(message)) { if (intfdata->in_flight) { spin_unlock_irq(&intfdata->susp_lock); return -EBUSY; } } intfdata->suspended = 1; spin_unlock_irq(&intfdata->susp_lock); stop_urbs(serial); return 0; } EXPORT_SYMBOL(usb_wwan_suspend); /* Caller must hold susp_lock. */ static int usb_wwan_submit_delayed_urbs(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct usb_wwan_intf_private *data = usb_get_serial_data(serial); struct usb_wwan_port_private *portdata; struct urb *urb; int err_count = 0; int err; portdata = usb_get_serial_port_data(port); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit urb failed: %d\n", __func__, err); err_count++; unbusy_queued_urb(urb, portdata); usb_autopm_put_interface_async(serial->interface); continue; } data->in_flight++; } if (err_count) return -EIO; return 0; } int usb_wwan_resume(struct usb_serial *serial) { int i, j; struct usb_serial_port *port; struct usb_wwan_intf_private *intfdata = usb_get_serial_data(serial); struct usb_wwan_port_private *portdata; struct urb *urb; int err; int err_count = 0; spin_lock_irq(&intfdata->susp_lock); for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; if (!tty_port_initialized(&port->port)) continue; portdata = usb_get_serial_port_data(port); if (port->interrupt_in_urb) { err = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s: submit int urb failed: %d\n", __func__, err); err_count++; } } err = usb_wwan_submit_delayed_urbs(port); if (err) err_count++; for (j = 0; j < N_IN_URB; j++) { urb = portdata->in_urbs[j]; err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { dev_err(&port->dev, "%s: submit read urb %d failed: %d\n", __func__, i, err); err_count++; } } } intfdata->suspended = 0; spin_unlock_irq(&intfdata->susp_lock); if (err_count) return -EIO; return 0; } EXPORT_SYMBOL(usb_wwan_resume); #endif MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
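usb_wwan_tiocmget() and usb_wwan_tiocmset() above back the standard TIOCMGET/TIOCMSET (and TIOCMBIS/TIOCMBIC) tty ioctls. A minimal userspace sketch of driving them follows; the /dev/ttyUSB0 path is only an assumed example node for a WWAN port.

/*
 * Minimal userspace sketch (not from the driver above): assert DTR/RTS and
 * read back the modem-control lines on a WWAN serial port. TIOCMBIS/TIOCMGET
 * end up in usb_wwan_tiocmset()/usb_wwan_tiocmget(). The device path below
 * is an assumption for illustration.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <termios.h>
#include <sys/ioctl.h>

int main(void)
{
	int bits = TIOCM_DTR | TIOCM_RTS;
	int status;
	int fd = open("/dev/ttyUSB0", O_RDWR | O_NOCTTY);

	if (fd < 0)
		return 1;
	/* Raise DTR and RTS; the driver forwards this as a CDC
	 * SET_CONTROL_LINE_STATE request via usb_wwan_send_setup(). */
	if (ioctl(fd, TIOCMBIS, &bits) < 0)
		perror("TIOCMBIS");
	/* Read the line state reported by usb_wwan_tiocmget(). */
	if (ioctl(fd, TIOCMGET, &status) == 0)
		printf("DTR=%d RTS=%d CTS=%d DCD=%d RI=%d\n",
		       !!(status & TIOCM_DTR), !!(status & TIOCM_RTS),
		       !!(status & TIOCM_CTS), !!(status & TIOCM_CAR),
		       !!(status & TIOCM_RNG));
	close(fd);
	return 0;
}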
// SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (grabner@icg.tugraz.at) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/initval.h> #include <sound/hwdep.h> #include "capture.h" #include "driver.h" #include "midi.h" #include "playback.h" #define DRIVER_AUTHOR "Markus Grabner <grabner@icg.tugraz.at>" #define DRIVER_DESC "Line 6 USB Driver" /* This is Line 6's MIDI manufacturer ID. */ const unsigned char line6_midi_id[3] = { 0x00, 0x01, 0x0c }; EXPORT_SYMBOL_GPL(line6_midi_id); /* Code to request version of POD, Variax interface (and maybe other devices). */ static const char line6_request_version[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x01, 0xf7 }; /* Class for asynchronous messages. */ struct message { struct usb_line6 *line6; const char *buffer; int size; int done; }; /* Forward declarations. */ static void line6_data_received(struct urb *urb); static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb); /* Start to listen on endpoint. */ static int line6_start_listen(struct usb_line6 *line6) { int err; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(line6->urb_listen, line6->usbdev, usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6, line6->interval); } else { usb_fill_bulk_urb(line6->urb_listen, line6->usbdev, usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6); } /* sanity checks of EP before actually submitting */ if (usb_urb_ep_type_check(line6->urb_listen)) { dev_err(line6->ifcdev, "invalid control EP\n"); return -EINVAL; } line6->urb_listen->actual_length = 0; err = usb_submit_urb(line6->urb_listen, GFP_ATOMIC); return err; } /* Stop listening on endpoint. */ static void line6_stop_listen(struct usb_line6 *line6) { usb_kill_urb(line6->urb_listen); } /* Send raw message in pieces of wMaxPacketSize bytes. */ int line6_send_raw_message(struct usb_line6 *line6, const char *buffer, int size) { int i, done = 0; const struct line6_properties *properties = line6->properties; for (i = 0; i < size; i += line6->max_packet_size) { int partial; const char *frag_buf = buffer + i; int frag_size = min(line6->max_packet_size, size - i); int retval; if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { retval = usb_interrupt_msg(line6->usbdev, usb_sndintpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } else { retval = usb_bulk_msg(line6->usbdev, usb_sndbulkpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } if (retval) { dev_err(line6->ifcdev, "usb_bulk_msg failed (%d)\n", retval); break; } done += frag_size; } return done; } EXPORT_SYMBOL_GPL(line6_send_raw_message); /* Notification of completion of asynchronous request transmission. */ static void line6_async_request_sent(struct urb *urb) { struct message *msg = (struct message *)urb->context; if (msg->done >= msg->size) { usb_free_urb(urb); kfree(msg); } else line6_send_raw_message_async_part(msg, urb); } /* Asynchronously send part of a raw message. 
*/ static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb) { int retval; struct usb_line6 *line6 = msg->line6; int done = msg->done; int bytes = min(msg->size - done, line6->max_packet_size); if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(urb, line6->usbdev, usb_sndintpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg, line6->interval); } else { usb_fill_bulk_urb(urb, line6->usbdev, usb_sndbulkpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg); } msg->done += bytes; /* sanity checks of EP before actually submitting */ retval = usb_urb_ep_type_check(urb); if (retval < 0) goto error; retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval < 0) goto error; return 0; error: dev_err(line6->ifcdev, "%s: usb_submit_urb failed (%d)\n", __func__, retval); usb_free_urb(urb); kfree(msg); return retval; } /* Asynchronously send raw message. */ int line6_send_raw_message_async(struct usb_line6 *line6, const char *buffer, int size) { struct message *msg; struct urb *urb; /* create message: */ msg = kzalloc(sizeof(struct message), GFP_ATOMIC); if (msg == NULL) return -ENOMEM; /* create URB: */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) { kfree(msg); return -ENOMEM; } /* set message data: */ msg->line6 = line6; msg->buffer = buffer; msg->size = size; msg->done = 0; /* start sending: */ return line6_send_raw_message_async_part(msg, urb); } EXPORT_SYMBOL_GPL(line6_send_raw_message_async); /* Send asynchronous device version request. */ int line6_version_request_async(struct usb_line6 *line6) { char *buffer; int retval; buffer = kmemdup(line6_request_version, sizeof(line6_request_version), GFP_ATOMIC); if (buffer == NULL) return -ENOMEM; retval = line6_send_raw_message_async(line6, buffer, sizeof(line6_request_version)); kfree(buffer); return retval; } EXPORT_SYMBOL_GPL(line6_version_request_async); /* Send sysex message in pieces of wMaxPacketSize bytes. */ int line6_send_sysex_message(struct usb_line6 *line6, const char *buffer, int size) { return line6_send_raw_message(line6, buffer, size + SYSEX_EXTRA_SIZE) - SYSEX_EXTRA_SIZE; } EXPORT_SYMBOL_GPL(line6_send_sysex_message); /* Allocate buffer for sysex message and prepare header. @param code sysex message code @param size number of bytes between code and sysex end */ char *line6_alloc_sysex_buffer(struct usb_line6 *line6, int code1, int code2, int size) { char *buffer = kmalloc(size + SYSEX_EXTRA_SIZE, GFP_ATOMIC); if (!buffer) return NULL; buffer[0] = LINE6_SYSEX_BEGIN; memcpy(buffer + 1, line6_midi_id, sizeof(line6_midi_id)); buffer[sizeof(line6_midi_id) + 1] = code1; buffer[sizeof(line6_midi_id) + 2] = code2; buffer[sizeof(line6_midi_id) + 3 + size] = LINE6_SYSEX_END; return buffer; } EXPORT_SYMBOL_GPL(line6_alloc_sysex_buffer); /* Notification of data received from the Line 6 device. 
*/ static void line6_data_received(struct urb *urb) { struct usb_line6 *line6 = (struct usb_line6 *)urb->context; struct midi_buffer *mb = &line6->line6midi->midibuf_in; int done; if (urb->status == -ESHUTDOWN) return; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { done = line6_midibuf_write(mb, urb->transfer_buffer, urb->actual_length); if (done < urb->actual_length) { line6_midibuf_ignore(mb, done); dev_dbg(line6->ifcdev, "%d %d buffer overflow - message skipped\n", done, urb->actual_length); } for (;;) { done = line6_midibuf_read(mb, line6->buffer_message, LINE6_MIDI_MESSAGE_MAXLEN, LINE6_MIDIBUF_READ_RX); if (done <= 0) break; line6->message_length = done; line6_midi_receive(line6, line6->buffer_message, done); if (line6->process_message) line6->process_message(line6); } } else { line6->buffer_message = urb->transfer_buffer; line6->message_length = urb->actual_length; if (line6->process_message) line6->process_message(line6); line6->buffer_message = NULL; } line6_start_listen(line6); } #define LINE6_READ_WRITE_STATUS_DELAY 2 /* milliseconds */ #define LINE6_READ_WRITE_MAX_RETRIES 50 /* Read data from device. */ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; u8 len; unsigned count; if (address > 0xffff || datalen > 0xff) return -EINVAL; /* query the serial number: */ ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, (datalen << 8) | 0x21, address, NULL, 0, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "read request failed (error %d)\n", ret); goto exit; } /* Wait for data length. We'll get 0xff until length arrives. */ for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, &len, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receive length failed (error %d)\n", ret); goto exit; } if (len != 0xff) break; } ret = -EIO; if (len == 0xff) { dev_err(line6->ifcdev, "read failed after %d retries\n", count); goto exit; } else if (len != datalen) { /* should be equal or something went wrong */ dev_err(line6->ifcdev, "length mismatch (expected %d, got %d)\n", (int)datalen, len); goto exit; } /* receive the result: */ ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0013, 0x0000, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) dev_err(line6->ifcdev, "read failed (error %d)\n", ret); exit: return ret; } EXPORT_SYMBOL_GPL(line6_read_data); /* Write data to device. 
*/ int line6_write_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; unsigned char *status; int count; if (address > 0xffff || datalen > 0xffff) return -EINVAL; status = kmalloc(1, GFP_KERNEL); if (!status) return -ENOMEM; ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, 0x0022, address, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "write request failed (error %d)\n", ret); goto exit; } for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, status, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receiving status failed (error %d)\n", ret); goto exit; } if (*status != 0xff) break; } if (*status == 0xff) { dev_err(line6->ifcdev, "write failed after %d retries\n", count); ret = -EIO; } else if (*status != 0) { dev_err(line6->ifcdev, "write failed (error %d)\n", ret); ret = -EIO; } exit: kfree(status); return ret; } EXPORT_SYMBOL_GPL(line6_write_data); /* Read Line 6 device serial number. (POD, TonePort, GuitarPort) */ int line6_read_serial_number(struct usb_line6 *line6, u32 *serial_number) { return line6_read_data(line6, 0x80d0, serial_number, sizeof(*serial_number)); } EXPORT_SYMBOL_GPL(line6_read_serial_number); /* Card destructor. */ static void line6_destruct(struct snd_card *card) { struct usb_line6 *line6 = card->private_data; struct usb_device *usbdev = line6->usbdev; /* Free buffer memory first. We cannot depend on the existence of private * data from the (podhd) module, it may be gone already during this call */ kfree(line6->buffer_message); kfree(line6->buffer_listen); /* then free URBs: */ usb_free_urb(line6->urb_listen); line6->urb_listen = NULL; /* decrement reference counters: */ usb_put_dev(usbdev); } static void line6_get_usb_properties(struct usb_line6 *line6) { struct usb_device *usbdev = line6->usbdev; const struct line6_properties *properties = line6->properties; int pipe; struct usb_host_endpoint *ep = NULL; if (properties->capabilities & LINE6_CAP_CONTROL) { if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { pipe = usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r); } else { pipe = usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r); } ep = usbdev->ep_in[usb_pipeendpoint(pipe)]; } /* Control data transfer properties */ if (ep) { line6->interval = ep->desc.bInterval; line6->max_packet_size = le16_to_cpu(ep->desc.wMaxPacketSize); } else { if (properties->capabilities & LINE6_CAP_CONTROL) { dev_err(line6->ifcdev, "endpoint not available, using fallback values"); } line6->interval = LINE6_FALLBACK_INTERVAL; line6->max_packet_size = LINE6_FALLBACK_MAXPACKETSIZE; } /* Isochronous transfer properties */ if (usbdev->speed == USB_SPEED_LOW) { line6->intervals_per_second = USB_LOW_INTERVALS_PER_SECOND; line6->iso_buffers = USB_LOW_ISO_BUFFERS; } else { line6->intervals_per_second = USB_HIGH_INTERVALS_PER_SECOND; line6->iso_buffers = USB_HIGH_ISO_BUFFERS; } } /* Enable buffering of incoming messages, flush the buffer */ static int line6_hwdep_open(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; /* NOTE: hwdep layer provides atomicity here */ line6->messages.active = 1; line6->messages.nonblock = file->f_flags & O_NONBLOCK ? 
1 : 0; return 0; } /* Stop buffering */ static int line6_hwdep_release(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; line6->messages.active = 0; return 0; } /* Read from circular buffer, return to user */ static long line6_hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; long rv = 0; unsigned int out_count; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; while (kfifo_len(&line6->messages.fifo) == 0) { mutex_unlock(&line6->messages.read_lock); if (line6->messages.nonblock) return -EAGAIN; rv = wait_event_interruptible( line6->messages.wait_queue, kfifo_len(&line6->messages.fifo) != 0); if (rv < 0) return rv; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; } if (kfifo_peek_len(&line6->messages.fifo) > count) { /* Buffer too small; allow re-read of the current item... */ rv = -EINVAL; } else { rv = kfifo_to_user(&line6->messages.fifo, buf, count, &out_count); if (rv == 0) rv = out_count; } mutex_unlock(&line6->messages.read_lock); return rv; } /* Write directly (no buffering) to device by user*/ static long line6_hwdep_write(struct snd_hwdep *hwdep, const char __user *data, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; int rv; char *data_copy; if (count > line6->max_packet_size * LINE6_RAW_MESSAGES_MAXCOUNT) { /* This is an arbitrary limit - still better than nothing... */ return -EINVAL; } data_copy = memdup_user(data, count); if (IS_ERR(data_copy)) return PTR_ERR(data_copy); rv = line6_send_raw_message(line6, data_copy, count); kfree(data_copy); return rv; } static __poll_t line6_hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait) { __poll_t rv; struct usb_line6 *line6 = hwdep->private_data; poll_wait(file, &line6->messages.wait_queue, wait); mutex_lock(&line6->messages.read_lock); rv = kfifo_len(&line6->messages.fifo) == 0 ? 
0 : EPOLLIN | EPOLLRDNORM; mutex_unlock(&line6->messages.read_lock); return rv; } static const struct snd_hwdep_ops hwdep_ops = { .open = line6_hwdep_open, .release = line6_hwdep_release, .read = line6_hwdep_read, .write = line6_hwdep_write, .poll = line6_hwdep_poll, }; /* Insert into circular buffer */ static void line6_hwdep_push_message(struct usb_line6 *line6) { if (!line6->messages.active) return; if (kfifo_avail(&line6->messages.fifo) >= line6->message_length) { /* No race condition here, there's only one writer */ kfifo_in(&line6->messages.fifo, line6->buffer_message, line6->message_length); } /* else TODO: signal overflow */ wake_up_interruptible(&line6->messages.wait_queue); } static int line6_hwdep_init(struct usb_line6 *line6) { int err; struct snd_hwdep *hwdep; /* TODO: usb_driver_claim_interface(); */ line6->process_message = line6_hwdep_push_message; line6->messages.active = 0; init_waitqueue_head(&line6->messages.wait_queue); mutex_init(&line6->messages.read_lock); INIT_KFIFO(line6->messages.fifo); err = snd_hwdep_new(line6->card, "config", 0, &hwdep); if (err < 0) goto end; strcpy(hwdep->name, "config"); hwdep->iface = SNDRV_HWDEP_IFACE_LINE6; hwdep->ops = hwdep_ops; hwdep->private_data = line6; hwdep->exclusive = true; end: return err; } static int line6_init_cap_control(struct usb_line6 *line6) { int ret; /* initialize USB buffers: */ line6->buffer_listen = kzalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); if (!line6->buffer_listen) return -ENOMEM; line6->urb_listen = usb_alloc_urb(0, GFP_KERNEL); if (!line6->urb_listen) return -ENOMEM; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { line6->buffer_message = kzalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); if (!line6->buffer_message) return -ENOMEM; ret = line6_init_midi(line6); if (ret < 0) return ret; } else { ret = line6_hwdep_init(line6); if (ret < 0) return ret; } ret = line6_start_listen(line6); if (ret < 0) { dev_err(line6->ifcdev, "cannot start listening: %d\n", ret); return ret; } return 0; } static void line6_startup_work(struct work_struct *work) { struct usb_line6 *line6 = container_of(work, struct usb_line6, startup_work.work); if (line6->startup) line6->startup(line6); } /* Probe USB device. 
*/ int line6_probe(struct usb_interface *interface, const struct usb_device_id *id, const char *driver_name, const struct line6_properties *properties, int (*private_init)(struct usb_line6 *, const struct usb_device_id *id), size_t data_size) { struct usb_device *usbdev = interface_to_usbdev(interface); struct snd_card *card; struct usb_line6 *line6; int interface_number; int ret; if (WARN_ON(data_size < sizeof(*line6))) return -EINVAL; /* we don't handle multiple configurations */ if (usbdev->descriptor.bNumConfigurations != 1) return -ENODEV; ret = snd_card_new(&interface->dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, THIS_MODULE, data_size, &card); if (ret < 0) return ret; /* store basic data: */ line6 = card->private_data; line6->card = card; line6->properties = properties; line6->usbdev = usbdev; line6->ifcdev = &interface->dev; INIT_DELAYED_WORK(&line6->startup_work, line6_startup_work); strcpy(card->id, properties->id); strcpy(card->driver, driver_name); strcpy(card->shortname, properties->name); sprintf(card->longname, "Line 6 %s at USB %s", properties->name, dev_name(line6->ifcdev)); card->private_free = line6_destruct; usb_set_intfdata(interface, line6); /* increment reference counters: */ usb_get_dev(usbdev); /* initialize device info: */ dev_info(&interface->dev, "Line 6 %s found\n", properties->name); /* query interface number */ interface_number = interface->cur_altsetting->desc.bInterfaceNumber; /* TODO reserves the bus bandwidth even without actual transfer */ ret = usb_set_interface(usbdev, interface_number, properties->altsetting); if (ret < 0) { dev_err(&interface->dev, "set_interface failed\n"); goto error; } line6_get_usb_properties(line6); if (properties->capabilities & LINE6_CAP_CONTROL) { ret = line6_init_cap_control(line6); if (ret < 0) goto error; } /* initialize device data based on device: */ ret = private_init(line6, id); if (ret < 0) goto error; /* creation of additional special files should go here */ dev_info(&interface->dev, "Line 6 %s now attached\n", properties->name); return 0; error: /* we can call disconnect callback here because no close-sync is * needed yet at this point */ line6_disconnect(interface); return ret; } EXPORT_SYMBOL_GPL(line6_probe); /* Line 6 device disconnected. */ void line6_disconnect(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct usb_device *usbdev = interface_to_usbdev(interface); if (!line6) return; if (WARN_ON(usbdev != line6->usbdev)) return; cancel_delayed_work_sync(&line6->startup_work); if (line6->urb_listen != NULL) line6_stop_listen(line6); snd_card_disconnect(line6->card); if (line6->line6pcm) line6_pcm_disconnect(line6->line6pcm); if (line6->disconnect) line6->disconnect(line6); dev_info(&interface->dev, "Line 6 %s now disconnected\n", line6->properties->name); /* make sure the device isn't destructed twice: */ usb_set_intfdata(interface, NULL); snd_card_free_when_closed(line6->card); } EXPORT_SYMBOL_GPL(line6_disconnect); #ifdef CONFIG_PM /* Suspend Line 6 device. */ int line6_suspend(struct usb_interface *interface, pm_message_t message) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct snd_line6_pcm *line6pcm = line6->line6pcm; snd_power_change_state(line6->card, SNDRV_CTL_POWER_D3hot); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_stop_listen(line6); if (line6pcm != NULL) line6pcm->flags = 0; return 0; } EXPORT_SYMBOL_GPL(line6_suspend); /* Resume Line 6 device. 
*/ int line6_resume(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_start_listen(line6); snd_power_change_state(line6->card, SNDRV_CTL_POWER_D0); return 0; } EXPORT_SYMBOL_GPL(line6_resume); #endif /* CONFIG_PM */ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
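line6_alloc_sysex_buffer() above frames outgoing messages as MIDI SysEx: a start byte, the three-byte Line 6 manufacturer ID, two code bytes, the payload, and an end byte. The standalone sketch below lays out the same framing so the byte order is visible; the literal 0xf0/0xf7 delimiters are the ordinary MIDI SysEx values and are assumed to match LINE6_SYSEX_BEGIN/LINE6_SYSEX_END. Unlike the driver helper, this sketch also copies the payload instead of leaving it for the caller to fill in.

/*
 * Minimal standalone sketch (not from the driver): build the SysEx frame
 * that line6_alloc_sysex_buffer() above prepares.
 * Layout: 0xf0 | manufacturer ID (3 bytes) | code1 | code2 | payload | 0xf7
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const unsigned char line6_midi_id[3] = { 0x00, 0x01, 0x0c };

static unsigned char *alloc_sysex(int code1, int code2,
				  const unsigned char *payload, int size)
{
	/* 1 start byte + 3 ID bytes + 2 code bytes + payload + 1 end byte */
	unsigned char *buf = malloc(size + 7);

	if (!buf)
		return NULL;
	buf[0] = 0xf0;					/* SysEx begin */
	memcpy(buf + 1, line6_midi_id, sizeof(line6_midi_id));
	buf[4] = code1;
	buf[5] = code2;
	memcpy(buf + 6, payload, size);
	buf[6 + size] = 0xf7;				/* SysEx end */
	return buf;
}

int main(void)
{
	const unsigned char payload[] = { 0x11, 0x22 };
	unsigned char *msg = alloc_sysex(0x01, 0x02, payload, sizeof(payload));
	int i;

	if (!msg)
		return 1;
	for (i = 0; i < (int)sizeof(payload) + 7; i++)
		printf("%02x ", msg[i]);
	printf("\n");
	free(msg);
	return 0;
}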
// SPDX-License-Identifier: GPL-2.0-or-later /* * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539) * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC */ #include <asm/unaligned.h> #include <crypto/algapi.h> #include <crypto/internal/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { struct skcipher_walk walk; u32 state[16]; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); chacha_crypt_generic(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv); } static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct chacha_ctx subctx; u32 state[16]; u8 real_iv[16]; /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init_generic(state, ctx->key, req->iv); hchacha_block_generic(state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt, .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = 
crypto_xchacha_crypt, } }; static int __init chacha_generic_mod_init(void) { return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_generic_mod_fini(void) { crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } subsys_initcall(chacha_generic_mod_init); module_exit(chacha_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-generic");
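crypto_xchacha_crypt() above derives a subkey with hchacha_block_generic() and then rebuilds the 16-byte ChaCha IV from the 32-byte XChaCha IV. The standalone sketch below repeats just that IV re-assembly so the byte positions are explicit.

/*
 * Minimal standalone sketch: the IV re-assembly done by crypto_xchacha_crypt()
 * above. Bytes 0-15 of the XChaCha IV feed HChaCha; the inner ChaCha IV takes
 * the stream position from bytes 24-31 and the remaining nonce bits from
 * bytes 16-23.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char iv[32], real_iv[16];
	int i;

	for (i = 0; i < 32; i++)
		iv[i] = (unsigned char)i;	/* example XChaCha IV */

	memcpy(&real_iv[0], iv + 24, 8);	/* stream position */
	memcpy(&real_iv[8], iv + 16, 8);	/* remaining 64 nonce bits */

	for (i = 0; i < 16; i++)
		printf("%02x ", real_iv[i]);
	printf("\n");				/* 18 19 ... 1f 10 11 ... 17 */
	return 0;
}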
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 
implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock 
*sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) 
return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. 
*/ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. 
*/ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: 
if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case 
MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - 
(sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? 
sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; 
if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT: val = inet6_test_bit(RTALERT, sk); break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
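The set and get paths above are reached from user space through the ordinary setsockopt()/getsockopt() system calls at level IPPROTO_IPV6. Below is a minimal, hedged user-space sketch (not part of the kernel source) that exercises two of the options handled in do_ipv6_setsockopt() and do_ipv6_getsockopt(): IPV6_UNICAST_HOPS, which is handled without taking the socket lock, and IPV6_TCLASS, which is stored in np->tclass under the lock. It assumes a Linux toolchain with the usual <netinet/in.h> definitions.

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int hops = 32;      /* do_ipv6_setsockopt() rejects values outside -1..255 */
	int tclass = 0x20;  /* CS1; ECN bits are masked for SOCK_STREAM sockets */
	int val;
	socklen_t len = sizeof(val);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	if (setsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hops, sizeof(hops)) < 0)
		perror("IPV6_UNICAST_HOPS");

	if (setsockopt(fd, IPPROTO_IPV6, IPV6_TCLASS, &tclass, sizeof(tclass)) < 0)
		perror("IPV6_TCLASS");

	/* getsockopt() lands in do_ipv6_getsockopt() and copies an int back */
	if (getsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &val, &len) == 0)
		printf("hop limit: %d\n", val);

	close(fd);
	return 0;
}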
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <asm/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/nf_synproxy.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_synproxy.h> unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts) { int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; if (unlikely(length < 0)) return false; ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; opts->options = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return true; case TCPOPT_NOP: length--; continue; default: if (length < 2) return true; opsize = *ptr++; if (opsize < 2) return true; if (opsize > length) return true; switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW) { opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } ptr += opsize - 2; length -= opsize; } } return true; } EXPORT_SYMBOL_GPL(synproxy_parse_options);
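synproxy_parse_options() above walks the TCP option list using the standard kind/length encoding: EOL (kind 0) terminates the list, NOP (kind 1) occupies a single padding byte, and every other option carries a length byte that covers the kind and length bytes themselves. The following stand-alone sketch is an illustrative user-space rendering of that same walk over a raw option buffer; the constant values are the standard TCP option kinds and lengths, and the function names are hypothetical, not kernel API.

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL        0
#define TCPOPT_NOP        1
#define TCPOPT_MSS        2
#define TCPOPT_WINDOW     3
#define TCPOPT_SACK_PERM  4
#define TCPOPT_TIMESTAMP  8

static void walk_tcp_options(const uint8_t *ptr, int length)
{
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;                 /* end of option list */
		case TCPOPT_NOP:
			length--;               /* one-byte padding */
			continue;
		default:
			if (length < 2)
				return;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return;         /* malformed: stop, as the kernel parser does */
			if (opcode == TCPOPT_MSS && opsize == 4)
				printf("MSS %u\n", (unsigned)((ptr[0] << 8) | ptr[1]));
			else if (opcode == TCPOPT_WINDOW && opsize == 3)
				printf("wscale %u\n", ptr[0]);
			else if (opcode == TCPOPT_SACK_PERM && opsize == 2)
				printf("SACK permitted\n");
			else if (opcode == TCPOPT_TIMESTAMP && opsize == 10)
				printf("timestamps present\n");
			ptr += opsize - 2;      /* skip the option payload */
			length -= opsize;
		}
	}
}

int main(void)
{
	/* MSS 1460, NOP, wscale 7, SACK permitted, EOL */
	const uint8_t opts[] = { 2, 4, 0x05, 0xb4, 1, 3, 3, 7, 4, 2, 0 };

	walk_tcp_options(opts, sizeof(opts));
	return 0;
}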
static unsigned int synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); else *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); static void synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) opts->options |= NF_SYNPROXY_OPT_WSCALE; opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; opts->options |= opts->tsecr & (1 << 5) ? 
NF_SYNPROXY_OPT_ECN : 0; } static unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, struct tcphdr *th, struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; if (synproxy->tsoff == 0) return 1; optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); } inet_proto_csum_replace4(&th->check, skb, old, *ptr, false); return 1; } optoff += op[1]; } } return 1; } #ifdef CONFIG_PROC_FS static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } return NULL; } static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(snet->stats, cpu); } (*pos)++; return NULL; } static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) { return; } static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) { struct synproxy_stats *stats = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries\t\tsyn_received\t" "cookie_invalid\tcookie_valid\t" "cookie_retrans\tconn_reopened\n"); return 0; } seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, stats->syn_received, stats->cookie_invalid, stats->cookie_valid, stats->cookie_retrans, stats->conn_reopened); return 0; } static const struct seq_operations synproxy_cpu_seq_ops = { .start = synproxy_cpu_seq_start, .next = synproxy_cpu_seq_next, .stop = synproxy_cpu_seq_stop, .show = synproxy_cpu_seq_show, }; static int __net_init synproxy_proc_init(struct net *net) { if (!proc_create_net("synproxy", 0444, net->proc_net_stat, &synproxy_cpu_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } static void __net_exit synproxy_proc_exit(struct net *net) { remove_proc_entry("synproxy", net->proc_net_stat); } #else static int __net_init synproxy_proc_init(struct net *net) { return 0; } static void __net_exit synproxy_proc_exit(struct net *net) { return; } #endif /* CONFIG_PROC_FS */ static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); struct nf_conn *ct; int err = -ENOMEM; ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL); if (!ct) goto err1; if (!nfct_seqadj_ext_add(ct)) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); if (snet->stats == NULL) goto err2; err = synproxy_proc_init(net); if (err < 0) goto err3; return 0; err3: free_percpu(snet->stats); err2: nf_ct_tmpl_free(ct); err1: return err; } static void __net_exit 
synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } static struct pernet_operations synproxy_net_ops = { .init = synproxy_net_init, .exit = synproxy_net_exit, .id = &synproxy_net_id, .size = sizeof(struct synproxy_net), }; static int __init synproxy_core_init(void) { return register_pernet_subsys(&synproxy_net_ops); } static void __exit synproxy_core_exit(void) { unregister_pernet_subsys(&synproxy_net_ops); } module_init(synproxy_core_init); module_exit(synproxy_core_exit); static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); iph->version = 4; iph->ihl = sizeof(*iph) / 4; iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; iph->daddr = daddr; return iph; } static void synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } ip_local_out(net, nskb->sk, nskb); return; free_nskb: kfree_skb(nskb); } void synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; u16 mss = opts->mss_encode; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE; nth->doff = tcp_hdr_size / 4; nth->window = 0; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } EXPORT_SYMBOL_GPL(synproxy_send_client_synack); static void synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = 
skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(recv_seq - 1); /* ack_seq is used to relay our ISN to the synproxy hook to initialize * sequence number translation once a connection tracking entry exists. */ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); tcp_flag_word(nth) = TCP_FLAG_SYN; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; nth->doff = tcp_hdr_size / 4; nth->window = th->window; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(ntohl(th->ack_seq)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ip_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(ntohl(th->seq) + 1); nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } bool synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v4_check(ip_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; } this_cpu_inc(snet->stats->cookie_valid); opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { struct net *net = nhs->net; struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; struct 
synproxy_options opts = {}; const struct ip_ct_tcp *state; struct tcphdr *th, _th; unsigned int thoff; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; synproxy = nfct_synproxy(ct); if (!synproxy) return NF_ACCEPT; if (nf_is_loopback_packet(skb) || ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); if (!th) return NF_DROP; state = &ct->proto.tcp; switch (state->state) { case TCP_CONNTRACK_CLOSE: if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq) + 1); break; } if (!th->syn || th->ack || CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) break; /* Reopened connection - reset the sequence number and timestamp * adjustments, they will get initialized once the connection is * reestablished. */ nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } synproxy->isn = ntohl(th->ack_seq); if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) break; if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); } opts.options &= ~(NF_SYNPROXY_OPT_MSS | NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; default: break; } synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); return NF_ACCEPT; } EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); static const struct nf_hook_ops ipv4_synproxy_ops[] = { { .hook = ipv4_synproxy_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv4_synproxy_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, }; int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) { int err; if (snet->hook_ref4 == 0) { err = nf_register_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); if (err) return err; } snet->hook_ref4++; return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) { snet->hook_ref4--; if (snet->hook_ref4 == 0) nf_unregister_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); #if IS_ENABLED(CONFIG_IPV6) static struct ipv6hdr * synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct ipv6hdr *iph; skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 
0); iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit); iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; return iph; } static void synproxy_send_tcp_ipv6(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct ipv6hdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { struct dst_entry *dst; struct flowi6 fl6; int err; nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.saddr = niph->saddr; fl6.daddr = niph->daddr; fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) goto free_nskb; skb_dst_set(nskb, dst); if (nfct) { nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } ip6_local_out(net, nskb->sk, nskb); return; free_nskb: kfree_skb(nskb); } void synproxy_send_client_synack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; u16 mss = opts->mss_encode; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE; nth->doff = tcp_hdr_size / 4; nth->window = 0; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); static void synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(recv_seq - 1); /* ack_seq is used to relay our ISN to the synproxy hook to initialize * sequence number translation once a connection tracking entry exists. 
*/ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); tcp_flag_word(nth) = TCP_FLAG_SYN; if (opts->options & NF_SYNPROXY_OPT_ECN) tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; nth->doff = tcp_hdr_size / 4; nth->window = th->window; nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(ntohl(th->ack_seq)); nth->ack_seq = htonl(ntohl(th->seq) + 1); tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; unsigned int tcp_hdr_size; iph = ipv6_hdr(skb); tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, GFP_ATOMIC); if (!nskb) return; skb_reserve(nskb, MAX_TCP_HEADER); niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(ntohl(th->seq) + 1); nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; synproxy_build_options(nth, opts); synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } bool synproxy_recv_client_ack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = nf_cookie_v6_check(ipv6_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; } this_cpu_inc(snet->stats->cookie_valid); opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); return true; } EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { struct net *net = nhs->net; struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; struct synproxy_options opts = {}; const struct ip_ct_tcp *state; struct tcphdr *th, _th; __be16 frag_off; u8 nexthdr; int thoff; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; synproxy = 
nfct_synproxy(ct); if (!synproxy) return NF_ACCEPT; if (nf_is_loopback_packet(skb)) return NF_ACCEPT; nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); if (!th) return NF_DROP; state = &ct->proto.tcp; switch (state->state) { case TCP_CONNTRACK_CLOSE: if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq) + 1); break; } if (!th->syn || th->ack || CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) break; /* Reopened connection - reset the sequence number and timestamp * adjustments, they will get initialized once the connection is * reestablished. */ nf_ct_seqadj_init(ct, ctinfo, 0); synproxy->tsoff = 0; this_cpu_inc(snet->stats->conn_reopened); fallthrough; case TCP_CONNTRACK_SYN_SENT: if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } synproxy->isn = ntohl(th->ack_seq); if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) break; if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; nf_conntrack_event_cache(IPCT_SYNPROXY, ct); } opts.options &= ~(NF_SYNPROXY_OPT_MSS | NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; default: break; } synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); return NF_ACCEPT; } EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); static const struct nf_hook_ops ipv6_synproxy_ops[] = { { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv6_synproxy_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, }; int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { int err; if (snet->hook_ref6 == 0) { err = nf_register_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); if (err) return err; } snet->hook_ref6++; return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) { snet->hook_ref6--; if (snet->hook_ref6 == 0) nf_unregister_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); #endif /* CONFIG_IPV6 */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("nftables SYNPROXY expression support");
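synproxy_init_timestamp_cookie() above hides per-connection metadata in the low six bits of the TCP timestamp value: bits 0-3 carry the client's window scale (0xf meaning "no window scaling"), bit 4 flags SACK permitted, and bit 5 flags ECN; synproxy_check_timestamp_cookie() later recovers those flags from the echoed tsecr. The encode/decode round trip is summarized in the hedged user-space sketch below; the names and helper functions are illustrative assumptions, only the bit layout is taken from the code above.

#include <stdio.h>
#include <stdint.h>

#define COOKIE_WSCALE_MASK 0x0fu      /* bits 0-3: window scale, 0xf = none */
#define COOKIE_SACK_BIT    (1u << 4)  /* bit 4: SACK permitted */
#define COOKIE_ECN_BIT     (1u << 5)  /* bit 5: ECN */

/* Encode: clear the low six bits of the millisecond clock and stash
 * wscale/SACK/ECN there, mirroring synproxy_init_timestamp_cookie(). */
static uint32_t cookie_encode(uint32_t clock_ms, int has_wscale, uint32_t wscale,
			      int sack_perm, int ecn)
{
	uint32_t tsval = clock_ms & ~0x3fu;

	tsval |= has_wscale ? (wscale & COOKIE_WSCALE_MASK) : COOKIE_WSCALE_MASK;
	if (sack_perm)
		tsval |= COOKIE_SACK_BIT;
	if (ecn)
		tsval |= COOKIE_ECN_BIT;
	return tsval;
}

/* Decode the echoed tsecr, mirroring synproxy_check_timestamp_cookie(). */
static void cookie_decode(uint32_t tsecr)
{
	uint32_t wscale = tsecr & COOKIE_WSCALE_MASK;

	printf("wscale: %s (%u)\n", wscale != COOKIE_WSCALE_MASK ? "yes" : "no", wscale);
	printf("sack_perm: %u\n", !!(tsecr & COOKIE_SACK_BIT));
	printf("ecn: %u\n", !!(tsecr & COOKIE_ECN_BIT));
}

int main(void)
{
	uint32_t tsval = cookie_encode(0x12345678u, 1, 7, 1, 0);

	cookie_decode(tsval);	/* the client's ACK echoes this value back as tsecr */
	return 0;
}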
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Checksumming functions for IP, TCP, UDP and so on
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Borrows very liberally from tcp.c and ip.c, see those
 *		files for more names.
 */

#ifndef _CHECKSUM_H
#define _CHECKSUM_H

#include <linux/errno.h>
#include <asm/types.h>
#include <asm/byteorder.h>
#include <asm/checksum.h>
#if !defined(_HAVE_ARCH_COPY_AND_CSUM_FROM_USER) || !defined(HAVE_CSUM_COPY_USER)
#include <linux/uaccess.h>
#endif

#ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
static __always_inline
__wsum csum_and_copy_from_user (const void __user *src, void *dst,
				      int len)
{
	if (copy_from_user(dst, src, len))
		return 0;
	return csum_partial(dst, len, ~0U);
}
#endif

#ifndef HAVE_CSUM_COPY_USER
static __always_inline __wsum csum_and_copy_to_user
(const void *src, void __user *dst, int len)
{
	__wsum sum = csum_partial(src, len, ~0U);

	if (copy_to_user(dst, src, len) == 0)
		return sum;
	return 0;
}
#endif

#ifndef _HAVE_ARCH_CSUM_AND_COPY
static __always_inline __wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
	memcpy(dst, src, len);
	return csum_partial(dst, len, 0);
}
#endif

#ifndef HAVE_ARCH_CSUM_ADD
static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
{
	u32 res = (__force u32)csum;

	res += (__force u32)addend;
	return (__force __wsum)(res + (res < (__force u32)addend));
}
#endif

static __always_inline __wsum csum_sub(__wsum csum, __wsum addend)
{
	return csum_add(csum, ~addend);
}

static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend)
{
	u16 res = (__force u16)csum;

	res += (__force u16)addend;
	return (__force __sum16)(res + (res < (__force u16)addend));
}

static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
{
	return csum16_add(csum, ~addend);
}

#ifndef HAVE_ARCH_CSUM_SHIFT
static __always_inline __wsum csum_shift(__wsum sum, int offset)
{
	/* rotate sum to align it with a 16b boundary */
	if (offset & 1)
		return (__force __wsum)ror32((__force u32)sum, 8);
	return sum;
}
#endif

static __always_inline __wsum
csum_block_add(__wsum csum, __wsum csum2, int offset)
{
	return csum_add(csum, csum_shift(csum2, offset));
}

static __always_inline __wsum
csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len)
{
	return csum_block_add(csum, csum2, offset);
}

static __always_inline __wsum
csum_block_sub(__wsum csum, __wsum csum2, int offset)
{
	return csum_block_add(csum, ~csum2, offset);
}

static __always_inline __wsum csum_unfold(__sum16 n)
{
	return (__force __wsum)n;
}

static __always_inline
__wsum csum_partial_ext(const void *buff, int len, __wsum sum)
{
	return csum_partial(buff, len, sum);
}

#define CSUM_MANGLED_0 ((__force __sum16)0xffff)

static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
{
	*sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
}

static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
{
	__wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);

	*sum = csum_fold(csum_add(tmp, (__force __wsum)to));
}

/* Implements RFC 1624 (Incremental Internet Checksum)
 * 3. Discussion states :
 *     HC' = ~(~HC + ~m + m')
 *  m : old value of a 16bit field
 *  m' : new value of a 16bit field
 */
static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new)
{
	*sum = ~csum16_add(csum16_sub(~(*sum), old), new);
}

static inline void csum_replace(__wsum *csum, __wsum old, __wsum new)
{
	*csum = csum_add(csum_sub(*csum, old), new);
}

struct sk_buff;
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
			      __be32 from, __be32 to, bool pseudohdr);
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
			       const __be32 *from, const __be32 *to,
			       bool pseudohdr);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
				     __wsum diff, bool pseudohdr);

static __always_inline
void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
			      __be16 from, __be16 to, bool pseudohdr)
{
	inet_proto_csum_replace4(sum, skb, (__force __be32)from,
				 (__force __be32)to, pseudohdr);
}

static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum,
					     int start, int offset)
{
	__sum16 *psum = (__sum16 *)(ptr + offset);
	__wsum delta;

	/* Subtract out checksum up to start */
	csum = csum_sub(csum, csum_partial(ptr, start, 0));

	/* Set derived checksum in packet */
	delta = csum_sub((__force __wsum)csum_fold(csum),
			 (__force __wsum)*psum);
	*psum = csum_fold(csum);

	return delta;
}

static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta)
{
	*psum = csum_fold(csum_sub(delta, (__force __wsum)*psum));
}

static __always_inline __wsum wsum_negate(__wsum val)
{
	return (__force __wsum)-((__force u32)val);
}
#endif
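/*
 * Stand-alone user-space illustration (not part of this header) of the
 * RFC 1624 rule HC' = ~(~HC + ~m + m') that csum_replace2() implements:
 * after one 16-bit field changes, the checksum is patched without
 * re-summing the whole buffer.  Values and names are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t add16(uint16_t a, uint16_t b)
{
	uint32_t s = (uint32_t)a + b;

	return (uint16_t)((s & 0xffff) + (s >> 16));	/* end-around carry */
}

static uint16_t cksum(const uint16_t *p, int n)
{
	uint16_t sum = 0;

	while (n--)
		sum = add16(sum, *p++);
	return (uint16_t)~sum;				/* checksum HC = ~sum */
}

int main(void)
{
	uint16_t buf[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
	uint16_t hc = cksum(buf, 4);
	uint16_t m = buf[2], m_new = 0x1111;
	uint16_t hc_inc;

	buf[2] = m_new;
	/* HC' = ~(~HC + ~m + m'), cf. csum_replace2() above */
	hc_inc = (uint16_t)~add16(add16((uint16_t)~hc, (uint16_t)~m), m_new);

	printf("recomputed=%04x incremental=%04x\n", cksum(buf, 4), hc_inc);
	return 0;
}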
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS		0x00000001	/* File was accessed */
#define FS_MODIFY		0x00000002	/* File was modified */
#define FS_ATTRIB		0x00000004	/* Metadata changed */
#define FS_CLOSE_WRITE		0x00000008	/* Writable file was closed */
#define FS_CLOSE_NOWRITE	0x00000010	/* Unwritable file closed */
#define FS_OPEN			0x00000020	/* File was opened */
#define FS_MOVED_FROM		0x00000040	/* File was moved from X */
#define FS_MOVED_TO		0x00000080	/* File was moved to Y */
#define FS_CREATE		0x00000100	/* Subfile was created */
#define FS_DELETE		0x00000200	/* Subfile was deleted */
#define FS_DELETE_SELF		0x00000400	/* Self was deleted */
#define FS_MOVE_SELF		0x00000800	/* Self was moved */
#define FS_OPEN_EXEC		0x00001000	/* File was opened for exec */

#define FS_UNMOUNT		0x00002000	/* inode on umount fs */
#define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
#define FS_ERROR		0x00008000	/* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR.  It is only used internally by inotify
 * which does not support FS_ERROR.
 */
#define FS_IN_IGNORED		0x00008000	/* last inotify event here */

#define FS_OPEN_PERM		0x00010000	/* open event in a permission hook */
#define FS_ACCESS_PERM		0x00020000	/* access event in a permission hook */
#define FS_OPEN_EXEC_PERM	0x00040000	/* open/exec event in a permission hook */

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD	0x08000000

#define FS_RENAME		0x10000000	/* File was renamed */
#define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
#define FS_ISDIR		0x40000000	/* event occurred against dir */

#define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
				  FS_OPEN_EXEC_PERM)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
*/ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; }; /* * fsnotify group priorities. 
* Events are sent in order from highest priority to lowest priority. */ enum fsnotify_group_prio { FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ __FSNOTIFY_PRIO_NUM }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ #define FSNOTIFY_GROUP_NOFS 0x04 /* group lock is not direct reclaim safe */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. 
*/ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); if (group->flags & FSNOTIFY_GROUP_NOFS) group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { if (group->flags & FSNOTIFY_GROUP_NOFS) memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); if (group->flags & FSNOTIFY_GROUP_NOFS) WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. 
*/ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned char type; /* Type of object [lock] */ unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * Container for per-sb fsnotify state (sb marks and more). * Attached lazily on first marked object on the sb and freed when killing sb. */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. * * The value in watched_objects[prio] is the number of objects that are * watched by groups of priority >= prio, so watched_objects[0] is the * total number of watched objects in this sb. 
*/ atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) { #ifdef CONFIG_FSNOTIFY return READ_ONCE(sb->s_fsnotify_info); #else return NULL; #endif } static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &fsnotify_sb_info(sb)->watched_objects[0]; } /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 #define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? 
*/ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following __fsnotify_update_child_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* * 
Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. */ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. 
*/ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group); /* attach the mark to the object */ int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); } /* run all the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } #else static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void 
fsnotify_sb_delete(struct super_block *sb)
{}

static inline void fsnotify_sb_free(struct super_block *sb)
{}

static inline void fsnotify_update_flags(struct dentry *dentry)
{}

static inline u32 fsnotify_get_cookie(void)
{
	return 0;
}

static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

#endif	/* CONFIG_FSNOTIFY */

#endif	/* __KERNEL__ */

#endif	/* __LINUX_FSNOTIFY_BACKEND_H */
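/*
 * Minimal sketch (not part of this header) of a backend that only implements
 * handle_inode_event(): allocate a group, attach one inode mark, and log
 * modify/close-write events.  The demo_* names are hypothetical, error
 * handling is trimmed, and it is assumed that fsnotify_alloc_group() returns
 * an ERR_PTR() on failure; <linux/slab.h>, <linux/err.h> and <linux/printk.h>
 * are needed in addition to this header.
 */
static int demo_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
				   struct inode *inode, struct inode *dir,
				   const struct qstr *file_name, u32 cookie)
{
	pr_info("demo fsnotify: mask=0x%x on inode %lu\n", mask,
		inode ? inode->i_ino : 0);
	return 0;
}

static void demo_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);
}

static const struct fsnotify_ops demo_fsnotify_ops = {
	.handle_inode_event	= demo_handle_inode_event,
	.free_mark		= demo_free_mark,
};

static int demo_watch_inode(struct inode *inode)
{
	struct fsnotify_group *group;
	struct fsnotify_mark *mark;
	int ret;

	group = fsnotify_alloc_group(&demo_fsnotify_ops, 0);
	if (IS_ERR(group))
		return PTR_ERR(group);

	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
	if (!mark) {
		fsnotify_put_group(group);
		return -ENOMEM;
	}

	fsnotify_init_mark(mark, group);
	mark->mask = FS_MODIFY | FS_CLOSE_WRITE;

	ret = fsnotify_add_inode_mark(mark, inode, 0);
	if (ret) {
		fsnotify_put_mark(mark);	/* drops initial ref, calls free_mark() */
		fsnotify_put_group(group);
		return ret;
	}

	/*
	 * A real backend keeps the mark/group pointers and later tears them
	 * down with fsnotify_destroy_mark(), fsnotify_put_mark() and
	 * fsnotify_destroy_group().
	 */
	return 0;
}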
/* inflate.c -- zlib decompression
 * Copyright (C) 1995-2005 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Based on zlib 1.2.3 but modified for the Linux Kernel by
 * Richard Purdie <richard@openedhand.com>
 *
 * Changes mainly for static instead of dynamic memory allocation
 *
 */

#include <linux/zutil.h>
#include "inftrees.h"
#include
"inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. 
*/ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... 
state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. 
*/ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { 
NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra 
= (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. 
*/ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
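/*
 * Editor's illustrative sketch (not part of the kernel sources above): a
 * minimal, self-contained userspace model of the hold/bits accumulator that
 * the NEEDBITS()/BITS()/DROPBITS() macros maintain inside inflate()'s state
 * machine.  All names (struct bitreader, need_bits, peek_bits, drop_bits)
 * and the simplified error handling are assumptions made for illustration.
 */
#include <stdio.h>
#include <stddef.h>

struct bitreader {
	const unsigned char *next;	/* next input byte */
	size_t have;			/* input bytes remaining */
	unsigned long hold;		/* bit accumulator, LSB first */
	unsigned bits;			/* valid bits currently in hold */
};

/* Like NEEDBITS(n): pull input bytes until n bits are available.
 * Returns 0 if the input runs out first (inflate() would "goto inf_leave"). */
static int need_bits(struct bitreader *br, unsigned n)
{
	while (br->bits < n) {
		if (br->have == 0)
			return 0;
		br->hold += (unsigned long)(*br->next++) << br->bits;
		br->have--;
		br->bits += 8;
	}
	return 1;
}

/* Like BITS(n): look at the low n bits without consuming them. */
static unsigned peek_bits(const struct bitreader *br, unsigned n)
{
	return (unsigned)br->hold & ((1U << n) - 1);
}

/* Like DROPBITS(n): consume n bits from the accumulator. */
static void drop_bits(struct bitreader *br, unsigned n)
{
	br->hold >>= n;
	br->bits -= n;
}

int main(void)
{
	static const unsigned char in[] = { 0x9d, 0x01 };	/* arbitrary sample */
	struct bitreader br = { in, sizeof(in), 0, 0 };

	if (need_bits(&br, 3))
		printf("low 3 bits: %u\n", peek_bits(&br, 3));
	drop_bits(&br, 3);
	return 0;
}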
// SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Socket Closing - normal and abnormal * * Copyright IBM Corp.
2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/workqueue.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_tx.h" #include "smc_cdc.h" #include "smc_close.h" /* release the clcsock that is assigned to the smc_sock */ void smc_clcsock_release(struct smc_sock *smc) { struct socket *tcp; if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; sock_release(tcp); } mutex_unlock(&smc->clcsock_release_lock); } static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; /* Close non-accepted connections */ while ((sk = smc_accept_dequeue(parent, NULL))) smc_close_non_accepted(sk); } /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = &smc->sk; if (!timeout) return; if (!smc_tx_prepared_sends(&smc->conn)) return; /* Send out corked data remaining in sndbuf */ smc_tx_pending(&smc->conn); smc->wait_close_tx_prepared = 1; add_wait_queue(sk_sleep(sk), &wait); while (!signal_pending(current) && timeout) { int rc; rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || READ_ONCE(sk->sk_err) == ECONNABORTED || READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) break; } remove_wait_queue(sk_sleep(sk), &wait); smc->wait_close_tx_prepared = 0; } void smc_close_wake_tx_prepared(struct smc_sock *smc) { if (smc->wait_close_tx_prepared) /* wake up socket closing */ smc->sk.sk_state_change(&smc->sk); } static int smc_close_wr(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1; return smc_cdc_get_slot_and_msg_send(conn); } static int smc_close_final(struct smc_connection *conn) { if (atomic_read(&conn->bytes_to_rcv)) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; if (conn->killed) return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return smc_cdc_get_slot_and_msg_send(conn); } static void smc_close_cancel_work(struct smc_sock *smc) { struct sock *sk = &smc->sk; release_sock(sk); if (cancel_work_sync(&smc->conn.close_work)) sock_put(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); } /* terminate smc socket abnormally - active abort * link group is terminated, i.e. 
RDMA communication no longer possible */ void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; if (smc->clcsock && smc->clcsock->sk) tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; sock_put(sk); /* (postponed) passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); if (sk->sk_state != SMC_PEERABORTWAIT) break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: break; } smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); if (release_clcsock) { release_sock(sk); smc_clcsock_release(smc); lock_sock(sk); } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) { return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort || conn->local_tx_ctrl.conn_state_flags.peer_conn_closed; } int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; long timeout; int rc = 0; int rc1 = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = sk->sk_state; again: switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); release_sock(sk); flush_work(&smc->tcp_listen_work); lock_sock(sk); break; case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); sk->sk_state = SMC_PEERCLOSEWAIT1; /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ if (smc->clcsock && smc->clcsock->sk) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; } } else { /* peer event has changed the state */ goto again; } break; case SMC_APPFINCLOSEWAIT: /* socket already shutdown wr or both (active close) */ if (txflags->peer_done_writing && !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); } sk->sk_state = SMC_CLOSED; break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_APPCLOSEWAIT1 && sk->sk_state != SMC_APPCLOSEWAIT2) goto again; /* confirm close from peer */ rc = smc_close_final(conn); if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; sock_put(sk); /* postponed passive closing */ } else { /* peer has just issued a shutdown write */ sk->sk_state = SMC_PEERFINCLOSEWAIT; } break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); } /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PEERFINCLOSEWAIT: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; } if (old_state != sk->sk_state) sk->sk_state_change(sk); return rc; } static void smc_close_passive_abort_received(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; switch (sk->sk_state) { case SMC_INIT: case SMC_ACTIVE: case SMC_APPCLOSEWAIT1: sk->sk_state = SMC_PROCESSABORT; sock_put(sk); /* passive closing */ break; case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PROCESSABORT; break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && !smc_close_sent_any_close(&smc->conn)) /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_PROCESSABORT; else sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; case SMC_PROCESSABORT: /* nothing to do, add tracing in future patch */ break; } } /* Either some kind of closing has been received: peer_conn_closed, * peer_conn_abort, or peer_done_writing * or the link group of the connection terminates abnormally. 
*/ static void smc_close_passive_work(struct work_struct *work) { struct smc_connection *conn = container_of(work, struct smc_connection, close_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_cdc_conn_state_flags *rxflags; bool release_clcsock = false; struct sock *sk = &smc->sk; int old_state; lock_sock(sk); old_state = sk->sk_state; rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ smc_close_passive_abort_received(smc); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); goto wakeup; } switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_APPCLOSEWAIT1; break; case SMC_ACTIVE: sk->sk_state = SMC_APPCLOSEWAIT1; /* postpone sock_put() for passive closing to cover * received SEND_SHUTDOWN as well */ break; case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) sk->sk_state = SMC_PEERCLOSEWAIT2; fallthrough; /* to check for closing */ case SMC_PEERCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) break; if (sock_flag(sk, SOCK_DEAD) && smc_close_sent_any_close(conn)) { /* smc_release has already been called locally */ sk->sk_state = SMC_CLOSED; } else { /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_APPFINCLOSEWAIT; } sock_put(sk); /* passive closing */ break; case SMC_PEERFINCLOSEWAIT: if (smc_cdc_rxed_any_close(conn)) { sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ } break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: /* postpone sock_put() for passive closing to cover * received SEND_SHUTDOWN as well */ break; case SMC_APPFINCLOSEWAIT: case SMC_PEERABORTWAIT: case SMC_PROCESSABORT: case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; } wakeup: sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ if (old_state != sk->sk_state) { sk->sk_state_change(sk); if ((sk->sk_state == SMC_CLOSED) && (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); if (smc->clcsock) release_clcsock = true; } } release_sock(sk); if (release_clcsock) smc_clcsock_release(smc); sock_put(sk); /* sock_hold done by schedulers of close_work */ } int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; long timeout; int rc = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; old_state = sk->sk_state; again: switch (sk->sk_state) { case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_ACTIVE) goto again; /* send close wr request */ rc = smc_close_wr(conn); sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: /* passive close */ if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state != SMC_APPCLOSEWAIT1) goto again; /* confirm close from peer */ rc = smc_close_wr(conn); sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_APPFINCLOSEWAIT: case SMC_PROCESSABORT: case SMC_PEERABORTWAIT: /* nothing to do, add tracing in future patch */ break; } if (old_state != sk->sk_state) sk->sk_state_change(sk); return rc; } /* Initialize close properties on connection establishment. 
*/ void smc_close_init(struct smc_sock *smc) { INIT_WORK(&smc->conn.close_work, smc_close_passive_work); }
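/*
 * Editor's illustrative sketch (userspace, not kernel code): a stripped-down
 * model of the SMC_ACTIVE branch of smc_close_active() above - drain pending
 * sends, send the close request, and move to SMC_PEERCLOSEWAIT1 to wait for
 * the peer's close.  The enum values, send_close_request() helper and the
 * omitted locking are simplifying assumptions, not the actual SMC API.
 */
#include <stdio.h>

enum smc_model_state { MODEL_ACTIVE, MODEL_PEERCLOSEWAIT1, MODEL_CLOSED };

/* Stands in for smc_close_final(); pretend the CDC message was sent. */
static int send_close_request(void)
{
	return 0;
}

static enum smc_model_state close_active_model(enum smc_model_state state)
{
	switch (state) {
	case MODEL_ACTIVE:
		/* smc_close_stream_wait() would drain the send buffer here */
		if (send_close_request() == 0)
			state = MODEL_PEERCLOSEWAIT1;	/* wait for peer close */
		break;
	default:
		state = MODEL_CLOSED;
		break;
	}
	return state;
}

int main(void)
{
	printf("new state: %d\n", close_active_model(MODEL_ACTIVE));
	return 0;
}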
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive * Copyright (C) 2018 Christoph Hellwig */ #include <linux/interrupt.h> #include <linux/irqchip.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/scs.h> #include <linux/seq_file.h> #include <asm/sbi.h> #include <asm/smp.h> #include <asm/softirq_stack.h> #include <asm/stacktrace.h> static struct fwnode_handle *(*__get_intc_node)(void); void riscv_set_intc_hwnode_fn(struct fwnode_handle *(*fn)(void)) { __get_intc_node = fn; } struct fwnode_handle *riscv_get_intc_hwnode(void) { if (__get_intc_node) return __get_intc_node(); return NULL; } EXPORT_SYMBOL_GPL(riscv_get_intc_hwnode); #ifdef CONFIG_IRQ_STACKS #include <asm/irq_stack.h> DECLARE_PER_CPU(ulong *, irq_shadow_call_stack_ptr); #ifdef CONFIG_SHADOW_CALL_STACK DEFINE_PER_CPU(ulong *, irq_shadow_call_stack_ptr); #endif static void init_irq_scs(void) { int cpu; if (!scs_is_enabled()) return; for_each_possible_cpu(cpu) per_cpu(irq_shadow_call_stack_ptr, cpu) = scs_alloc(cpu_to_node(cpu)); } DEFINE_PER_CPU(ulong *, irq_stack_ptr); #ifdef CONFIG_VMAP_STACK static void init_irq_stacks(void) { int cpu; ulong *p; for_each_possible_cpu(cpu) { p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } #else /* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ DEFINE_PER_CPU_ALIGNED(ulong [IRQ_STACK_SIZE/sizeof(ulong)], irq_stack); static void init_irq_stacks(void) { int cpu; for_each_possible_cpu(cpu) per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); } #endif /* CONFIG_VMAP_STACK */ #ifdef CONFIG_SOFTIRQ_ON_OWN_STACK static void ___do_softirq(struct pt_regs *regs) { __do_softirq(); } void do_softirq_own_stack(void) { if (on_thread_stack()) call_on_irq_stack(NULL, ___do_softirq); else __do_softirq(); } #endif /* CONFIG_SOFTIRQ_ON_OWN_STACK */ #else static void init_irq_scs(void) {} static void init_irq_stacks(void) {} #endif /* CONFIG_IRQ_STACKS */ int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_stats(p, prec); return 0; } void __init init_IRQ(void) { init_irq_scs(); init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); sbi_ipi_init(); }
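/*
 * Editor's illustrative sketch (userspace, not kernel code): the same
 * "one pre-allocated IRQ stack per CPU" bookkeeping that init_irq_stacks()
 * above sets up with per-CPU variables, modelled with a plain array.
 * NR_CPUS, the stack size and aligned_alloc() standing in for
 * arch_alloc_vmap_stack()/static per-CPU storage are all assumptions.
 */
#include <stdlib.h>
#include <stdio.h>

#define MODEL_NR_CPUS		4
#define MODEL_IRQ_STACK_SIZE	(16 * 1024)

static unsigned long *model_irq_stack_ptr[MODEL_NR_CPUS];

static int init_irq_stacks_model(void)
{
	int cpu;

	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		/* 16-byte alignment mirrors the comment in the non-VMAP case */
		model_irq_stack_ptr[cpu] = aligned_alloc(16, MODEL_IRQ_STACK_SIZE);
		if (!model_irq_stack_ptr[cpu])
			return -1;
	}
	return 0;
}

int main(void)
{
	if (init_irq_stacks_model() == 0)
		printf("allocated %d irq stacks of %d bytes each\n",
		       MODEL_NR_CPUS, MODEL_IRQ_STACK_SIZE);
	return 0;
}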
// SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0) /* * Generic implementation of the BLAKE2b digest algorithm. Based on the BLAKE2b * reference implementation, but it has been heavily modified for use in the * kernel. The reference implementation was: * * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under * the terms of the CC0, the OpenSSL Licence, or the Apache Public License * 2.0, at your option. The terms of these licenses can be found at: * * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 * - OpenSSL license : https://www.openssl.org/source/license.html * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 * * More information about BLAKE2 can be found at https://blake2.net. */ #include <asm/unaligned.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <crypto/internal/blake2b.h> #include <crypto/internal/hash.h> static const u8 blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc) { S->t[0] += inc; S->t[1] += (S->t[0] < inc); } #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ d = ror64(d ^ a, 32); \ c = c + d; \ b = ror64(b ^ c, 24); \ a = a + b + m[blake2b_sigma[r][2*i+1]]; \ d = ror64(d ^ a, 16); \ c = c + d; \ b = ror64(b ^ c, 63); \ } while (0) #define ROUND(r) \ do { \ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ G(r,2,v[ 2],v[ 6],v[10],v[14]); \ G(r,3,v[ 3],v[ 7],v[11],v[15]); \ G(r,4,v[ 0],v[ 5],v[10],v[15]); \ G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while (0) static void blake2b_compress_one_generic(struct blake2b_state *S, const u8 block[BLAKE2B_BLOCK_SIZE]) { u64 m[16]; u64 v[16]; size_t i; for (i = 0; i < 16; ++i) m[i] = get_unaligned_le64(block + i * sizeof(m[i])); for (i = 0; i < 8; ++i) v[i] = S->h[i]; v[ 8] = BLAKE2B_IV0; v[ 9] = BLAKE2B_IV1; v[10] = BLAKE2B_IV2; v[11] = BLAKE2B_IV3; v[12] = BLAKE2B_IV4 ^ S->t[0]; v[13] = BLAKE2B_IV5 ^ S->t[1]; v[14] = BLAKE2B_IV6 ^ S->f[0]; v[15] = BLAKE2B_IV7 ^ S->f[1]; ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4); ROUND(5); ROUND(6); ROUND(7); 
ROUND(8); ROUND(9); ROUND(10); ROUND(11); #ifdef CONFIG_CC_IS_CLANG #pragma nounroll /* https://llvm.org/pr45803 */ #endif for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; } #undef G #undef ROUND void blake2b_compress_generic(struct blake2b_state *state, const u8 *block, size_t nblocks, u32 inc) { do { blake2b_increment_counter(state, inc); blake2b_compress_one_generic(state, block); block += BLAKE2B_BLOCK_SIZE; } while (--nblocks); } EXPORT_SYMBOL(blake2b_compress_generic); static int crypto_blake2b_update_generic(struct shash_desc *desc, const u8 *in, unsigned int inlen) { return crypto_blake2b_update(desc, in, inlen, blake2b_compress_generic); } static int crypto_blake2b_final_generic(struct shash_desc *desc, u8 *out) { return crypto_blake2b_final(desc, out, blake2b_compress_generic); } #define BLAKE2B_ALG(name, driver_name, digest_size) \ { \ .base.cra_name = name, \ .base.cra_driver_name = driver_name, \ .base.cra_priority = 100, \ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ .base.cra_module = THIS_MODULE, \ .digestsize = digest_size, \ .setkey = crypto_blake2b_setkey, \ .init = crypto_blake2b_init, \ .update = crypto_blake2b_update_generic, \ .final = crypto_blake2b_final_generic, \ .descsize = sizeof(struct blake2b_state), \ } static struct shash_alg blake2b_algs[] = { BLAKE2B_ALG("blake2b-160", "blake2b-160-generic", BLAKE2B_160_HASH_SIZE), BLAKE2B_ALG("blake2b-256", "blake2b-256-generic", BLAKE2B_256_HASH_SIZE), BLAKE2B_ALG("blake2b-384", "blake2b-384-generic", BLAKE2B_384_HASH_SIZE), BLAKE2B_ALG("blake2b-512", "blake2b-512-generic", BLAKE2B_512_HASH_SIZE), }; static int __init blake2b_mod_init(void) { return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } static void __exit blake2b_mod_fini(void) { crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); } subsys_initcall(blake2b_mod_init); module_exit(blake2b_mod_fini); MODULE_AUTHOR("David Sterba <kdave@kernel.org>"); MODULE_DESCRIPTION("BLAKE2b generic implementation"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("blake2b-160"); MODULE_ALIAS_CRYPTO("blake2b-160-generic"); MODULE_ALIAS_CRYPTO("blake2b-256"); MODULE_ALIAS_CRYPTO("blake2b-256-generic"); MODULE_ALIAS_CRYPTO("blake2b-384"); MODULE_ALIAS_CRYPTO("blake2b-384-generic"); MODULE_ALIAS_CRYPTO("blake2b-512"); MODULE_ALIAS_CRYPTO("blake2b-512-generic");
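/*
 * Editor's illustrative sketch (not part of this file): one hedged way a
 * kernel user might compute a blake2b-256 digest through the shash
 * algorithms registered above, using the generic crypto_shash API.  The
 * function name is hypothetical, error handling is trimmed, and the caller
 * is assumed to run in a context that may sleep.
 */
#include <crypto/hash.h>
#include <linux/err.h>

static int blake2b256_digest_example(const u8 *data, unsigned int len,
				     u8 out[32])
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("blake2b-256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* one-shot digest: init + update + final in a single call */
		ret = crypto_shash_digest(desc, data, len, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return ret;
}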
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_skbprio.c SKB Priority Queue. * * Authors: Nishanth Devarajan, <ndev2021@gmail.com> * Cody Doucette, <doucette@bu.edu> * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu */ #include <linux/string.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> #include <net/inet_ecn.h> /* SKB Priority Queue * ================================= * * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes * packets according to their skb->priority field. Under congestion, * Skbprio drops already-enqueued lower priority packets to make space * available for higher priority packets; it was conceived as a solution * for denial-of-service defenses that need to route packets with different * priorities as a mean to overcome DoS attacks. */ struct skbprio_sched_data { /* Queue state. */ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; u16 highest_prio; u16 lowest_prio; }; static u16 calc_new_high_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return 0 (default highest priority setting). */ return 0; } static u16 calc_new_low_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 * (default lowest priority setting). */ return SKBPRIO_MAX_PRIORITY - 1; } static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *qdisc; struct sk_buff_head *lp_qdisc; struct sk_buff *to_drop; u16 prio, lp; /* Obtain the priority of @skb. 
*/ prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; /* sch->limit can change under us from skbprio_change() */ if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Check to update highest and lowest priorities. */ if (prio > q->highest_prio) q->highest_prio = prio; if (prio < q->lowest_prio) q->lowest_prio = prio; sch->q.qlen++; return NET_XMIT_SUCCESS; } /* If this packet has the lowest priority, drop it. */ lp = q->lowest_prio; if (prio <= lp) { q->qstats[prio].drops++; q->qstats[prio].overlimits++; return qdisc_drop(skb, sch, to_free); } __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Drop the packet at the tail of the lowest priority qdisc. */ lp_qdisc = &q->qdiscs[lp]; to_drop = __skb_dequeue_tail(lp_qdisc); BUG_ON(!to_drop); qdisc_qstats_backlog_dec(sch, to_drop); qdisc_drop(to_drop, sch, to_free); q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); q->qstats[lp].drops++; q->qstats[lp].overlimits++; /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { /* The incoming packet is the only packet in queue. */ BUG_ON(sch->q.qlen != 1); q->lowest_prio = prio; q->highest_prio = prio; } else { q->lowest_prio = calc_new_low_prio(q); } } if (prio > q->highest_prio) q->highest_prio = prio; return NET_XMIT_CN; } static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; struct sk_buff *skb = __skb_dequeue(hpq); if (unlikely(!skb)) return NULL; sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { BUG_ON(sch->q.qlen); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { q->highest_prio = calc_new_high_prio(q); } } return skb; } static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct tc_skbprio_qopt *ctl = nla_data(opt); if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; WRITE_ONCE(sch->limit, ctl->limit); return 0; } static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; /* Initialise all queues, one for each possible priority. 
*/ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_head_init(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; sch->limit = 64; if (!opt) return 0; return skbprio_change(sch, opt, extack); } static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; return skb->len; } static void skbprio_reset(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } static void skbprio_destroy(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); } static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) { return 0; } static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct skbprio_sched_data *q = qdisc_priv(sch); if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], q->qstats[cl - 1].qlen) < 0) return -1; return 0; } static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { unsigned int i; if (arg->stop) return; for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops skbprio_class_ops = { .leaf = skbprio_leaf, .find = skbprio_find, .dump = skbprio_dump_class, .dump_stats = skbprio_dump_class_stats, .walk = skbprio_walk, }; static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { .cl_ops = &skbprio_class_ops, .id = "skbprio", .priv_size = sizeof(struct skbprio_sched_data), .enqueue = skbprio_enqueue, .dequeue = skbprio_dequeue, .peek = qdisc_peek_dequeued, .init = skbprio_init, .reset = skbprio_reset, .change = skbprio_change, .dump = skbprio_dump, .destroy = skbprio_destroy, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("skbprio"); static int __init skbprio_module_init(void) { return register_qdisc(&skbprio_qdisc_ops); } static void __exit skbprio_module_exit(void) { unregister_qdisc(&skbprio_qdisc_ops); } module_init(skbprio_module_init) module_exit(skbprio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SKB priority based scheduling qdisc");
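/*
 * Editor's illustrative sketch (userspace, not kernel code): the core
 * admission decision of skbprio_enqueue() above, reduced to per-priority
 * counters.  When the qdisc is full, a new packet is accepted only if its
 * priority is strictly higher than the current lowest queued priority, in
 * which case one packet is dropped from that lowest-priority queue.  The
 * struct and function names are assumptions; rescanning for the new lowest
 * priority when a queue empties is omitted for brevity.  In practice the
 * qdisc would be attached with iproute2, e.g. something along the lines of
 * "tc qdisc add dev eth0 root skbprio limit 64" (exact syntax depends on
 * the iproute2 version).
 */
#include <stdio.h>

#define MODEL_MAX_PRIO	64
#define MODEL_LIMIT	64

struct skbprio_model {
	unsigned int qlen[MODEL_MAX_PRIO];	/* packets queued per priority */
	unsigned int total;
	unsigned int lowest;			/* lowest non-empty priority */
};

/* Returns 1 if the new packet is enqueued, 0 if it is dropped. */
static int enqueue_model(struct skbprio_model *m, unsigned int prio)
{
	if (prio > MODEL_MAX_PRIO - 1)
		prio = MODEL_MAX_PRIO - 1;	/* clamp like min(skb->priority, max) */

	if (m->total < MODEL_LIMIT) {
		m->qlen[prio]++;
		m->total++;
		if (m->total == 1 || prio < m->lowest)
			m->lowest = prio;
		return 1;
	}

	/* full: only higher-priority traffic may displace queued packets */
	if (prio <= m->lowest)
		return 0;

	m->qlen[m->lowest]--;		/* drop one from the lowest-priority queue */
	m->qlen[prio]++;
	return 1;
}

int main(void)
{
	struct skbprio_model m = { {0}, 0, 0 };
	int i;

	for (i = 0; i < MODEL_LIMIT; i++)
		enqueue_model(&m, 1);			/* fill with priority 1 */
	printf("prio 0 accepted: %d\n", enqueue_model(&m, 0));	/* 0: dropped */
	printf("prio 5 accepted: %d\n", enqueue_model(&m, 5));	/* 1: displaces */
	return 0;
}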