Total coverage: 174228 (13%)of 1373067
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2000-2002 Vojtech Pavlik <vojtech@ucw.cz> * Copyright (c) 2001-2002, 2007 Johann Deneux <johann.deneux@gmail.com> * * USB/RS232 I-Force joysticks and wheels. */ #include <linux/usb.h> #include "iforce.h" struct iforce_usb { struct iforce iforce; struct usb_device *usbdev; struct usb_interface *intf; struct urb *irq, *out; u8 data_in[IFORCE_MAX_LENGTH] ____cacheline_aligned; u8 data_out[IFORCE_MAX_LENGTH] ____cacheline_aligned; }; static void __iforce_usb_xmit(struct iforce *iforce) { struct iforce_usb *iforce_usb = container_of(iforce, struct iforce_usb, iforce); int n, c; guard(spinlock_irqsave)(&iforce->xmit_lock); if (iforce->xmit.head == iforce->xmit.tail) { iforce_clear_xmit_and_wake(iforce); return; } ((char *)iforce_usb->out->transfer_buffer)[0] = iforce->xmit.buf[iforce->xmit.tail]; XMIT_INC(iforce->xmit.tail, 1); n = iforce->xmit.buf[iforce->xmit.tail]; XMIT_INC(iforce->xmit.tail, 1); iforce_usb->out->transfer_buffer_length = n + 1; iforce_usb->out->dev = iforce_usb->usbdev; /* Copy rest of data then */ c = CIRC_CNT_TO_END(iforce->xmit.head, iforce->xmit.tail, XMIT_SIZE); if (n < c) c = n; memcpy(iforce_usb->out->transfer_buffer + 1, &iforce->xmit.buf[iforce->xmit.tail], c); if (n != c) { memcpy(iforce_usb->out->transfer_buffer + 1 + c, &iforce->xmit.buf[0], n - c); } XMIT_INC(iforce->xmit.tail, n); n=usb_submit_urb(iforce_usb->out, GFP_ATOMIC); if (n) { dev_warn(&iforce_usb->intf->dev, "usb_submit_urb failed %d\n", n); iforce_clear_xmit_and_wake(iforce); } /* The IFORCE_XMIT_RUNNING bit is not cleared here. That's intended. * As long as the urb completion handler is not called, the transmiting * is considered to be running */ } static void iforce_usb_xmit(struct iforce *iforce) { if (!test_and_set_bit(IFORCE_XMIT_RUNNING, iforce->xmit_flags)) __iforce_usb_xmit(iforce); } static int iforce_usb_get_id(struct iforce *iforce, u8 id, u8 *response_data, size_t *response_len) { struct iforce_usb *iforce_usb = container_of(iforce, struct iforce_usb, iforce); u8 *buf; int status; buf = kmalloc(IFORCE_MAX_LENGTH, GFP_KERNEL); if (!buf) return -ENOMEM; status = usb_control_msg(iforce_usb->usbdev, usb_rcvctrlpipe(iforce_usb->usbdev, 0), id, USB_TYPE_VENDOR | USB_DIR_IN | USB_RECIP_INTERFACE, 0, 0, buf, IFORCE_MAX_LENGTH, 1000); if (status < 0) { dev_err(&iforce_usb->intf->dev, "usb_submit_urb failed: %d\n", status); } else if (buf[0] != id) { status = -EIO; } else { memcpy(response_data, buf, status); *response_len = status; status = 0; } kfree(buf); return status; } static int iforce_usb_start_io(struct iforce *iforce) { struct iforce_usb *iforce_usb = container_of(iforce, struct iforce_usb, iforce); if (usb_submit_urb(iforce_usb->irq, GFP_KERNEL)) return -EIO; return 0; } static void iforce_usb_stop_io(struct iforce *iforce) { struct iforce_usb *iforce_usb = container_of(iforce, struct iforce_usb, iforce); usb_kill_urb(iforce_usb->irq); usb_kill_urb(iforce_usb->out); } static const struct iforce_xport_ops iforce_usb_xport_ops = { .xmit = iforce_usb_xmit, .get_id = iforce_usb_get_id, .start_io = iforce_usb_start_io, .stop_io = iforce_usb_stop_io, }; static void iforce_usb_irq(struct urb *urb) { struct iforce_usb *iforce_usb = urb->context; struct iforce *iforce = &iforce_usb->iforce; struct device *dev = &iforce_usb->intf->dev; int status; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, urb->status); return; default: dev_dbg(dev, "%s - urb has status of: %d\n", __func__, urb->status); goto exit; } iforce_process_packet(iforce, iforce_usb->data_in[0], iforce_usb->data_in + 1, urb->actual_length - 1); exit: status = usb_submit_urb(urb, GFP_ATOMIC); if (status) dev_err(dev, "%s - usb_submit_urb failed with result %d\n", __func__, status); } static void iforce_usb_out(struct urb *urb) { struct iforce_usb *iforce_usb = urb->context; struct iforce *iforce = &iforce_usb->iforce; if (urb->status) { dev_dbg(&iforce_usb->intf->dev, "urb->status %d, exiting\n", urb->status); iforce_clear_xmit_and_wake(iforce); return; } __iforce_usb_xmit(iforce); wake_up_all(&iforce->wait); } static int iforce_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct usb_host_interface *interface; struct usb_endpoint_descriptor *epirq, *epout; struct iforce_usb *iforce_usb; int err = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 2) return -ENODEV; epirq = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(epirq)) return -ENODEV; epout = &interface->endpoint[1].desc; if (!usb_endpoint_is_int_out(epout)) return -ENODEV; iforce_usb = kzalloc(sizeof(*iforce_usb), GFP_KERNEL); if (!iforce_usb) goto fail; iforce_usb->irq = usb_alloc_urb(0, GFP_KERNEL); if (!iforce_usb->irq) goto fail; iforce_usb->out = usb_alloc_urb(0, GFP_KERNEL); if (!iforce_usb->out) goto fail; iforce_usb->iforce.xport_ops = &iforce_usb_xport_ops; iforce_usb->usbdev = dev; iforce_usb->intf = intf; usb_fill_int_urb(iforce_usb->irq, dev, usb_rcvintpipe(dev, epirq->bEndpointAddress), iforce_usb->data_in, sizeof(iforce_usb->data_in), iforce_usb_irq, iforce_usb, epirq->bInterval); usb_fill_int_urb(iforce_usb->out, dev, usb_sndintpipe(dev, epout->bEndpointAddress), iforce_usb->data_out, sizeof(iforce_usb->data_out), iforce_usb_out, iforce_usb, epout->bInterval); err = iforce_init_device(&intf->dev, BUS_USB, &iforce_usb->iforce); if (err) goto fail; usb_set_intfdata(intf, iforce_usb); return 0; fail: if (iforce_usb) { usb_free_urb(iforce_usb->irq); usb_free_urb(iforce_usb->out); kfree(iforce_usb); } return err; } static void iforce_usb_disconnect(struct usb_interface *intf) { struct iforce_usb *iforce_usb = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); input_unregister_device(iforce_usb->iforce.dev); usb_free_urb(iforce_usb->irq); usb_free_urb(iforce_usb->out); kfree(iforce_usb); } static const struct usb_device_id iforce_usb_ids[] = { { USB_DEVICE(0x044f, 0xa01c) }, /* Thrustmaster Motor Sport GT */ { USB_DEVICE(0x046d, 0xc281) }, /* Logitech WingMan Force */ { USB_DEVICE(0x046d, 0xc291) }, /* Logitech WingMan Formula Force */ { USB_DEVICE(0x05ef, 0x020a) }, /* AVB Top Shot Pegasus */ { USB_DEVICE(0x05ef, 0x8884) }, /* AVB Mag Turbo Force */ { USB_DEVICE(0x05ef, 0x8888) }, /* AVB Top Shot FFB Racing Wheel */ { USB_DEVICE(0x061c, 0xc0a4) }, /* ACT LABS Force RS */ { USB_DEVICE(0x061c, 0xc084) }, /* ACT LABS Force RS */ { USB_DEVICE(0x06a3, 0xff04) }, /* Saitek R440 Force Wheel */ { USB_DEVICE(0x06f8, 0x0001) }, /* Guillemot Race Leader Force Feedback */ { USB_DEVICE(0x06f8, 0x0003) }, /* Guillemot Jet Leader Force Feedback */ { USB_DEVICE(0x06f8, 0x0004) }, /* Guillemot Force Feedback Racing Wheel */ { USB_DEVICE(0x06f8, 0xa302) }, /* Guillemot Jet Leader 3D */ { } /* Terminating entry */ }; MODULE_DEVICE_TABLE (usb, iforce_usb_ids); struct usb_driver iforce_usb_driver = { .name = "iforce", .probe = iforce_usb_probe, .disconnect = iforce_usb_disconnect, .id_table = iforce_usb_ids, }; module_usb_driver(iforce_usb_driver); MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>, Johann Deneux <johann.deneux@gmail.com>"); MODULE_DESCRIPTION("USB I-Force joysticks and wheels driver"); MODULE_LICENSE("GPL");
22 22 20 20 19 20 20 20 11 11 11 20 20 20 20 20 20 20 15 11 11 11 9 20 20 11 96 96 17 96 17 94 96 96 96 96 96 96 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sound core. This file is composed of two parts. sound_class * which is common to both OSS and ALSA and OSS sound core which * is used OSS or emulation of it. */ /* * First, the common part. */ #include <linux/module.h> #include <linux/device.h> #include <linux/err.h> #include <linux/kdev_t.h> #include <linux/major.h> #include <sound/core.h> #ifdef CONFIG_SOUND_OSS_CORE static int __init init_oss_soundcore(void); static void cleanup_oss_soundcore(void); #else static inline int init_oss_soundcore(void) { return 0; } static inline void cleanup_oss_soundcore(void) { } #endif MODULE_DESCRIPTION("Core sound module"); MODULE_AUTHOR("Alan Cox"); MODULE_LICENSE("GPL"); static char *sound_devnode(const struct device *dev, umode_t *mode) { if (MAJOR(dev->devt) == SOUND_MAJOR) return NULL; return kasprintf(GFP_KERNEL, "snd/%s", dev_name(dev)); } const struct class sound_class = { .name = "sound", .devnode = sound_devnode, }; EXPORT_SYMBOL(sound_class); static int __init init_soundcore(void) { int rc; rc = init_oss_soundcore(); if (rc) return rc; rc = class_register(&sound_class); if (rc) { cleanup_oss_soundcore(); return rc; } return 0; } static void __exit cleanup_soundcore(void) { cleanup_oss_soundcore(); class_unregister(&sound_class); } subsys_initcall(init_soundcore); module_exit(cleanup_soundcore); #ifdef CONFIG_SOUND_OSS_CORE /* * OSS sound core handling. Breaks out sound functions to submodules * * Author: Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * -------------------- * * Top level handler for the sound subsystem. Various devices can * plug into this. The fact they don't all go via OSS doesn't mean * they don't have to implement the OSS API. There is a lot of logic * to keeping much of the OSS weight out of the code in a compatibility * module, but it's up to the driver to rember to load it... * * The code provides a set of functions for registration of devices * by type. This is done rather than providing a single call so that * we can hide any future changes in the internals (eg when we go to * 32bit dev_t) from the modules and their interface. * * Secondly we need to allocate the dsp, dsp16 and audio devices as * one. Thus we misuse the chains a bit to simplify this. * * Thirdly to make it more fun and for 2.3.x and above we do all * of this using fine grained locking. * * FIXME: we have to resolve modules and fine grained load/unload * locking at some point in 2.3.x. */ #include <linux/init.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sound.h> #include <linux/kmod.h> #define SOUND_STEP 16 struct sound_unit { int unit_minor; const struct file_operations *unit_fops; struct sound_unit *next; char name[32]; }; /* * By default, OSS sound_core claims full legacy minor range (0-255) * of SOUND_MAJOR to trap open attempts to any sound minor and * requests modules using custom sound-slot/service-* module aliases. * The only benefit of doing this is allowing use of custom module * aliases instead of the standard char-major-* ones. This behavior * prevents alternative OSS implementation and is scheduled to be * removed. * * CONFIG_SOUND_OSS_CORE_PRECLAIM and soundcore.preclaim_oss kernel * parameter are added to allow distros and developers to try and * switch to alternative implementations without needing to rebuild * the kernel in the meantime. If preclaim_oss is non-zero, the * kernel will behave the same as before. All SOUND_MAJOR minors are * preclaimed and the custom module aliases along with standard chrdev * ones are emitted if a missing device is opened. If preclaim_oss is * zero, sound_core only grabs what's actually in use and for missing * devices only the standard chrdev aliases are requested. * * All these clutters are scheduled to be removed along with * sound-slot/service-* module aliases. */ static int preclaim_oss = IS_ENABLED(CONFIG_SOUND_OSS_CORE_PRECLAIM); module_param(preclaim_oss, int, 0444); static int soundcore_open(struct inode *, struct file *); static const struct file_operations soundcore_fops = { /* We must have an owner or the module locking fails */ .owner = THIS_MODULE, .open = soundcore_open, .llseek = noop_llseek, }; /* * Low level list operator. Scan the ordered list, find a hole and * join into it. Called with the lock asserted */ static int __sound_insert_unit(struct sound_unit * s, struct sound_unit **list, const struct file_operations *fops, int index, int low, int top) { int n=low; if (index < 0) { /* first free */ while (*list && (*list)->unit_minor<n) list=&((*list)->next); while(n<top) { /* Found a hole ? */ if(*list==NULL || (*list)->unit_minor>n) break; list=&((*list)->next); n+=SOUND_STEP; } if(n>=top) return -ENOENT; } else { n = low+(index*16); while (*list) { if ((*list)->unit_minor==n) return -EBUSY; if ((*list)->unit_minor>n) break; list=&((*list)->next); } } /* * Fill it in */ s->unit_minor=n; s->unit_fops=fops; /* * Link it */ s->next=*list; *list=s; return n; } /* * Remove a node from the chain. Called with the lock asserted */ static struct sound_unit *__sound_remove_unit(struct sound_unit **list, int unit) { while(*list) { struct sound_unit *p=*list; if(p->unit_minor==unit) { *list=p->next; return p; } list=&(p->next); } printk(KERN_ERR "Sound device %d went missing!\n", unit); return NULL; } /* * This lock guards the sound loader list. */ static DEFINE_SPINLOCK(sound_loader_lock); /* * Allocate the controlling structure and add it to the sound driver * list. Acquires locks as needed */ static int sound_insert_unit(struct sound_unit **list, const struct file_operations *fops, int index, int low, int top, const char *name, umode_t mode, struct device *dev) { struct sound_unit *s = kmalloc(sizeof(*s), GFP_KERNEL); int r; if (!s) return -ENOMEM; spin_lock(&sound_loader_lock); retry: r = __sound_insert_unit(s, list, fops, index, low, top); spin_unlock(&sound_loader_lock); if (r < 0) goto fail; else if (r < SOUND_STEP) sprintf(s->name, "sound/%s", name); else sprintf(s->name, "sound/%s%d", name, r / SOUND_STEP); if (!preclaim_oss) { /* * Something else might have grabbed the minor. If * first free slot is requested, rescan with @low set * to the next unit; otherwise, -EBUSY. */ r = __register_chrdev(SOUND_MAJOR, s->unit_minor, 1, s->name, &soundcore_fops); if (r < 0) { spin_lock(&sound_loader_lock); __sound_remove_unit(list, s->unit_minor); if (index < 0) { low = s->unit_minor + SOUND_STEP; goto retry; } spin_unlock(&sound_loader_lock); r = -EBUSY; goto fail; } } device_create(&sound_class, dev, MKDEV(SOUND_MAJOR, s->unit_minor), NULL, "%s", s->name+6); return s->unit_minor; fail: kfree(s); return r; } /* * Remove a unit. Acquires locks as needed. The drivers MUST have * completed the removal before their file operations become * invalid. */ static void sound_remove_unit(struct sound_unit **list, int unit) { struct sound_unit *p; spin_lock(&sound_loader_lock); p = __sound_remove_unit(list, unit); spin_unlock(&sound_loader_lock); if (p) { if (!preclaim_oss) __unregister_chrdev(SOUND_MAJOR, p->unit_minor, 1, p->name); device_destroy(&sound_class, MKDEV(SOUND_MAJOR, p->unit_minor)); kfree(p); } } /* * Allocations * * 0 *16 Mixers * 1 *8 Sequencers * 2 *16 Midi * 3 *16 DSP * 4 *16 SunDSP * 5 *16 DSP16 * 6 -- sndstat (obsolete) * 7 *16 unused * 8 -- alternate sequencer (see above) * 9 *16 raw synthesizer access * 10 *16 unused * 11 *16 unused * 12 *16 unused * 13 *16 unused * 14 *16 unused * 15 *16 unused */ static struct sound_unit *chains[SOUND_STEP]; /** * register_sound_special_device - register a special sound node * @fops: File operations for the driver * @unit: Unit number to allocate * @dev: device pointer * * Allocate a special sound device by minor number from the sound * subsystem. * * Return: The allocated number is returned on success. On failure, * a negative error code is returned. */ int register_sound_special_device(const struct file_operations *fops, int unit, struct device *dev) { const int chain = unit % SOUND_STEP; int max_unit = 256; const char *name; char _name[16]; switch (chain) { case 0: name = "mixer"; break; case 1: name = "sequencer"; if (unit >= SOUND_STEP) goto __unknown; max_unit = unit + 1; break; case 2: name = "midi"; break; case 3: name = "dsp"; break; case 4: name = "audio"; break; case 5: name = "dspW"; break; case 8: name = "sequencer2"; if (unit >= SOUND_STEP) goto __unknown; max_unit = unit + 1; break; case 9: name = "dmmidi"; break; case 10: name = "dmfm"; break; case 12: name = "adsp"; break; case 13: name = "amidi"; break; case 14: name = "admmidi"; break; default: { __unknown: sprintf(_name, "unknown%d", chain); if (unit >= SOUND_STEP) strcat(_name, "-"); name = _name; } break; } return sound_insert_unit(&chains[chain], fops, -1, unit, max_unit, name, 0600, dev); } EXPORT_SYMBOL(register_sound_special_device); int register_sound_special(const struct file_operations *fops, int unit) { return register_sound_special_device(fops, unit, NULL); } EXPORT_SYMBOL(register_sound_special); /** * register_sound_mixer - register a mixer device * @fops: File operations for the driver * @dev: Unit number to allocate * * Allocate a mixer device. Unit is the number of the mixer requested. * Pass -1 to request the next free mixer unit. * * Return: On success, the allocated number is returned. On failure, * a negative error code is returned. */ int register_sound_mixer(const struct file_operations *fops, int dev) { return sound_insert_unit(&chains[0], fops, dev, 0, 128, "mixer", 0600, NULL); } EXPORT_SYMBOL(register_sound_mixer); /* * DSP's are registered as a triple. Register only one and cheat * in open - see below. */ /** * register_sound_dsp - register a DSP device * @fops: File operations for the driver * @dev: Unit number to allocate * * Allocate a DSP device. Unit is the number of the DSP requested. * Pass -1 to request the next free DSP unit. * * This function allocates both the audio and dsp device entries together * and will always allocate them as a matching pair - eg dsp3/audio3 * * Return: On success, the allocated number is returned. On failure, * a negative error code is returned. */ int register_sound_dsp(const struct file_operations *fops, int dev) { return sound_insert_unit(&chains[3], fops, dev, 3, 131, "dsp", 0600, NULL); } EXPORT_SYMBOL(register_sound_dsp); /** * unregister_sound_special - unregister a special sound device * @unit: unit number to allocate * * Release a sound device that was allocated with * register_sound_special(). The unit passed is the return value from * the register function. */ void unregister_sound_special(int unit) { sound_remove_unit(&chains[unit % SOUND_STEP], unit); } EXPORT_SYMBOL(unregister_sound_special); /** * unregister_sound_mixer - unregister a mixer * @unit: unit number to allocate * * Release a sound device that was allocated with register_sound_mixer(). * The unit passed is the return value from the register function. */ void unregister_sound_mixer(int unit) { sound_remove_unit(&chains[0], unit); } EXPORT_SYMBOL(unregister_sound_mixer); /** * unregister_sound_dsp - unregister a DSP device * @unit: unit number to allocate * * Release a sound device that was allocated with register_sound_dsp(). * The unit passed is the return value from the register function. * * Both of the allocated units are released together automatically. */ void unregister_sound_dsp(int unit) { sound_remove_unit(&chains[3], unit); } EXPORT_SYMBOL(unregister_sound_dsp); static struct sound_unit *__look_for_unit(int chain, int unit) { struct sound_unit *s; s=chains[chain]; while(s && s->unit_minor <= unit) { if(s->unit_minor==unit) return s; s=s->next; } return NULL; } static int soundcore_open(struct inode *inode, struct file *file) { int chain; int unit = iminor(inode); struct sound_unit *s; const struct file_operations *new_fops = NULL; chain=unit&0x0F; if(chain==4 || chain==5) /* dsp/audio/dsp16 */ { unit&=0xF0; unit|=3; chain=3; } spin_lock(&sound_loader_lock); s = __look_for_unit(chain, unit); if (s) new_fops = fops_get(s->unit_fops); if (preclaim_oss && !new_fops) { spin_unlock(&sound_loader_lock); /* * Please, don't change this order or code. * For ALSA slot means soundcard and OSS emulation code * comes as add-on modules which aren't depend on * ALSA toplevel modules for soundcards, thus we need * load them at first. [Jaroslav Kysela <perex@jcu.cz>] */ request_module("sound-slot-%i", unit>>4); request_module("sound-service-%i-%i", unit>>4, chain); /* * sound-slot/service-* module aliases are scheduled * for removal in favor of the standard char-major-* * module aliases. For the time being, generate both * the legacy and standard module aliases to ease * transition. */ if (request_module("char-major-%d-%d", SOUND_MAJOR, unit) > 0) request_module("char-major-%d", SOUND_MAJOR); spin_lock(&sound_loader_lock); s = __look_for_unit(chain, unit); if (s) new_fops = fops_get(s->unit_fops); } spin_unlock(&sound_loader_lock); if (!new_fops) return -ENODEV; /* * We rely upon the fact that we can't be unloaded while the * subdriver is there. */ replace_fops(file, new_fops); if (!file->f_op->open) return -ENODEV; return file->f_op->open(inode, file); } MODULE_ALIAS_CHARDEV_MAJOR(SOUND_MAJOR); static void cleanup_oss_soundcore(void) { /* We have nothing to really do here - we know the lists must be empty */ unregister_chrdev(SOUND_MAJOR, "sound"); } static int __init init_oss_soundcore(void) { if (preclaim_oss && register_chrdev(SOUND_MAJOR, "sound", &soundcore_fops) < 0) { printk(KERN_ERR "soundcore: sound device already in use.\n"); return -EBUSY; } return 0; } #endif /* CONFIG_SOUND_OSS_CORE */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */ #include <crypto/aead.h> #include <linux/debugfs.h> #include <net/xfrm.h> #include "netdevsim.h" #define NSIM_IPSEC_AUTH_BITS 128 static ssize_t nsim_dbg_netdev_ops_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct netdevsim *ns = filp->private_data; struct nsim_ipsec *ipsec = &ns->ipsec; size_t bufsize; char *buf, *p; int len; int i; /* the buffer needed is * (num SAs * 3 lines each * ~60 bytes per line) + one more line */ bufsize = (ipsec->count * 4 * 60) + 60; buf = kzalloc(bufsize, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsize - (p - buf), "SA count=%u tx=%u\n", ipsec->count, ipsec->tx); for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { struct nsim_sa *sap = &ipsec->sa[i]; if (!sap->used) continue; if (sap->xs->props.family == AF_INET6) p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI6c\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr); else p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI4\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr[3]); p += scnprintf(p, bufsize - (p - buf), "sa[%i] spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n", i, be32_to_cpu(sap->xs->id.spi), sap->xs->id.proto, sap->salt, sap->crypt); p += scnprintf(p, bufsize - (p - buf), "sa[%i] key=0x%08x %08x %08x %08x\n", i, sap->key[0], sap->key[1], sap->key[2], sap->key[3]); } len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf); kfree(buf); return len; } static const struct file_operations ipsec_dbg_fops = { .owner = THIS_MODULE, .open = simple_open, .read = nsim_dbg_netdev_ops_read, }; static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec) { u32 i; if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT) return -ENOSPC; /* search sa table */ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { if (!ipsec->sa[i].used) return i; } return -ENOSPC; } static int nsim_ipsec_parse_proto_keys(struct net_device *dev, struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { const char aes_gcm_name[] = "rfc4106(gcm(aes))"; unsigned char *key_data; char *alg_name = NULL; int key_len; if (!xs->aead) { netdev_err(dev, "Unsupported IPsec algorithm\n"); return -EINVAL; } if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) { netdev_err(dev, "IPsec offload requires %d bit authentication\n", NSIM_IPSEC_AUTH_BITS); return -EINVAL; } key_data = &xs->aead->alg_key[0]; key_len = xs->aead->alg_key_len; alg_name = xs->aead->alg_name; if (strcmp(alg_name, aes_gcm_name)) { netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n", aes_gcm_name); return -EINVAL; } /* 160 accounts for 16 byte key and 4 byte salt */ if (key_len > NSIM_IPSEC_AUTH_BITS) { *mysalt = ((u32 *)key_data)[4]; } else if (key_len == NSIM_IPSEC_AUTH_BITS) { *mysalt = 0; } else { netdev_err(dev, "IPsec hw offload only supports 128 bit keys with optional 32 bit salt\n"); return -EINVAL; } memcpy(mykey, key_data, 16); return 0; } static int nsim_ipsec_add_sa(struct net_device *dev, struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct nsim_ipsec *ipsec; struct netdevsim *ns; struct nsim_sa sa; u16 sa_idx; int ret; ns = netdev_priv(dev); ipsec = &ns->ipsec; if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol for ipsec offload"); return -EINVAL; } if (xs->calg) { NL_SET_ERR_MSG_MOD(extack, "Compression offload not supported"); return -EINVAL; } if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ipsec offload type"); return -EINVAL; } /* find the first unused index */ ret = nsim_ipsec_find_empty_idx(ipsec); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "No space for SA in Rx table!"); return ret; } sa_idx = (u16)ret; memset(&sa, 0, sizeof(sa)); sa.used = true; sa.xs = xs; if (sa.xs->id.proto & IPPROTO_ESP) sa.crypt = xs->ealg || xs->aead; /* get the key and salt */ ret = nsim_ipsec_parse_proto_keys(dev, xs, sa.key, &sa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for SA table"); return ret; } if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) sa.rx = true; if (xs->props.family == AF_INET6) memcpy(sa.ipaddr, &xs->id.daddr.a6, 16); else memcpy(&sa.ipaddr[3], &xs->id.daddr.a4, 4); /* the preparations worked, so save the info */ memcpy(&ipsec->sa[sa_idx], &sa, sizeof(sa)); /* the XFRM stack doesn't like offload_handle == 0, * so add a bitflag in case our array index is 0 */ xs->xso.offload_handle = sa_idx | NSIM_IPSEC_VALID; ipsec->count++; return 0; } static void nsim_ipsec_del_sa(struct net_device *dev, struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(dev); struct nsim_ipsec *ipsec = &ns->ipsec; u16 sa_idx; sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (!ipsec->sa[sa_idx].used) { netdev_err(ns->netdev, "Invalid SA for delete sa_idx=%d\n", sa_idx); return; } memset(&ipsec->sa[sa_idx], 0, sizeof(struct nsim_sa)); ipsec->count--; } static const struct xfrmdev_ops nsim_xfrmdev_ops = { .xdo_dev_state_add = nsim_ipsec_add_sa, .xdo_dev_state_delete = nsim_ipsec_del_sa, }; bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); struct nsim_ipsec *ipsec = &ns->ipsec; struct xfrm_state *xs; struct nsim_sa *tsa; u32 sa_idx; /* do we even need to check this packet? */ if (!sp) return true; if (unlikely(!sp->len)) { netdev_err(ns->netdev, "no xfrm state len = %d\n", sp->len); return false; } xs = xfrm_input_state(skb); if (unlikely(!xs)) { netdev_err(ns->netdev, "no xfrm_input_state() xs = %p\n", xs); return false; } sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) { netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n", sa_idx, NSIM_IPSEC_MAX_SA_COUNT); return false; } tsa = &ipsec->sa[sa_idx]; if (unlikely(!tsa->used)) { netdev_err(ns->netdev, "unused sa_idx=%d\n", sa_idx); return false; } if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { netdev_err(ns->netdev, "unexpected proto=%d\n", xs->id.proto); return false; } ipsec->tx++; return true; } void nsim_ipsec_init(struct netdevsim *ns) { ns->netdev->xfrmdev_ops = &nsim_xfrmdev_ops; #define NSIM_ESP_FEATURES (NETIF_F_HW_ESP | \ NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) ns->netdev->features |= NSIM_ESP_FEATURES; ns->netdev->hw_features |= NSIM_ESP_FEATURES; ns->netdev->hw_enc_features |= NSIM_ESP_FEATURES; ns->ipsec.pfile = debugfs_create_file("ipsec", 0400, ns->nsim_dev_port->ddir, ns, &ipsec_dbg_fops); } void nsim_ipsec_teardown(struct netdevsim *ns) { struct nsim_ipsec *ipsec = &ns->ipsec; if (ipsec->count) netdev_err(ns->netdev, "tearing down IPsec offload with %d SAs left\n", ipsec->count); debugfs_remove_recursive(ipsec->pfile); }
3 122 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ /* * kernel/workqueue_internal.h * * Workqueue internal header file. Only to be included by workqueue and * core kernel subsystems. */ #ifndef _KERNEL_WORKQUEUE_INTERNAL_H #define _KERNEL_WORKQUEUE_INTERNAL_H #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/preempt.h> struct worker_pool; /* * The poor guys doing the actual heavy lifting. All on-duty workers are * either serving the manager role, on idle list or on busy hash. For * details on the locking annotation (L, I, X...), refer to workqueue.c. * * Only to be used in workqueue and async. */ struct worker { /* on idle list while idle, on busy hash table while busy */ union { struct list_head entry; /* L: while idle */ struct hlist_node hentry; /* L: while busy */ }; struct work_struct *current_work; /* K: work being processed and its */ work_func_t current_func; /* K: function */ struct pool_workqueue *current_pwq; /* K: pwq */ u64 current_at; /* K: runtime at start or last wakeup */ unsigned int current_color; /* K: color */ int sleeping; /* S: is worker sleeping? */ /* used by the scheduler to determine a worker's last known identity */ work_func_t last_func; /* K: last work's fn */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* A: the associated pool */ /* L: for rescuers */ struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ unsigned long last_active; /* K: last active timestamp */ unsigned int flags; /* L: flags */ int id; /* I: worker id */ /* * Opaque string set with work_set_desc(). Printed out with task * dump for debugging - WARN, BUG, panic or sysrq. */ char desc[WORKER_DESC_LEN]; /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; /** * current_wq_worker - return struct worker if %current is a workqueue worker */ static inline struct worker *current_wq_worker(void) { if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } /* * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ void wq_worker_running(struct task_struct *task); void wq_worker_sleeping(struct task_struct *task); void wq_worker_tick(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 // SPDX-License-Identifier: GPL-2.0 // rc-ir-raw.c - handle IR pulse/space events // // Copyright (C) 2010 by Mauro Carvalho Chehab #include <linux/export.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/kmod.h> #include <linux/sched.h> #include "rc-core-priv.h" /* Used to keep track of IR raw clients, protected by ir_raw_handler_lock */ static LIST_HEAD(ir_raw_client_list); /* Used to handle IR raw handler extensions */ DEFINE_MUTEX(ir_raw_handler_lock); static LIST_HEAD(ir_raw_handler_list); static atomic64_t available_protocols = ATOMIC64_INIT(0); static int ir_raw_event_thread(void *data) { struct ir_raw_event ev; struct ir_raw_handler *handler; struct ir_raw_event_ctrl *raw = data; struct rc_dev *dev = raw->dev; while (1) { mutex_lock(&ir_raw_handler_lock); while (kfifo_out(&raw->kfifo, &ev, 1)) { if (is_timing_event(ev)) { if (ev.duration == 0) dev_warn_once(&dev->dev, "nonsensical timing event of duration 0"); if (is_timing_event(raw->prev_ev) && !is_transition(&ev, &raw->prev_ev)) dev_warn_once(&dev->dev, "two consecutive events of type %s", TO_STR(ev.pulse)); } list_for_each_entry(handler, &ir_raw_handler_list, list) if (dev->enabled_protocols & handler->protocols || !handler->protocols) handler->decode(dev, ev); lirc_raw_event(dev, ev); raw->prev_ev = ev; } mutex_unlock(&ir_raw_handler_lock); set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); break; } else if (!kfifo_is_empty(&raw->kfifo)) set_current_state(TASK_RUNNING); schedule(); } return 0; } /** * ir_raw_event_store() - pass a pulse/space duration to the raw ir decoders * @dev: the struct rc_dev device descriptor * @ev: the struct ir_raw_event descriptor of the pulse/space * * This routine (which may be called from an interrupt context) stores a * pulse/space duration for the raw ir decoding state machines. Pulses are * signalled as positive values and spaces as negative values. A zero value * will reset the decoding state machines. */ int ir_raw_event_store(struct rc_dev *dev, struct ir_raw_event *ev) { if (!dev->raw) return -EINVAL; dev_dbg(&dev->dev, "sample: (%05dus %s)\n", ev->duration, TO_STR(ev->pulse)); if (!kfifo_put(&dev->raw->kfifo, *ev)) { dev_err(&dev->dev, "IR event FIFO is full!\n"); return -ENOSPC; } return 0; } EXPORT_SYMBOL_GPL(ir_raw_event_store); /** * ir_raw_event_store_edge() - notify raw ir decoders of the start of a pulse/space * @dev: the struct rc_dev device descriptor * @pulse: true for pulse, false for space * * This routine (which may be called from an interrupt context) is used to * store the beginning of an ir pulse or space (or the start/end of ir * reception) for the raw ir decoding state machines. This is used by * hardware which does not provide durations directly but only interrupts * (or similar events) on state change. */ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) { ktime_t now; struct ir_raw_event ev = {}; if (!dev->raw) return -EINVAL; now = ktime_get(); ev.duration = ktime_to_us(ktime_sub(now, dev->raw->last_event)); ev.pulse = !pulse; return ir_raw_event_store_with_timeout(dev, &ev); } EXPORT_SYMBOL_GPL(ir_raw_event_store_edge); /* * ir_raw_event_store_with_timeout() - pass a pulse/space duration to the raw * ir decoders, schedule decoding and * timeout * @dev: the struct rc_dev device descriptor * @ev: the struct ir_raw_event descriptor of the pulse/space * * This routine (which may be called from an interrupt context) stores a * pulse/space duration for the raw ir decoding state machines, schedules * decoding and generates a timeout. */ int ir_raw_event_store_with_timeout(struct rc_dev *dev, struct ir_raw_event *ev) { ktime_t now; int rc = 0; if (!dev->raw) return -EINVAL; now = ktime_get(); spin_lock(&dev->raw->edge_spinlock); rc = ir_raw_event_store(dev, ev); dev->raw->last_event = now; /* timer could be set to timeout (125ms by default) */ if (!timer_pending(&dev->raw->edge_handle) || time_after(dev->raw->edge_handle.expires, jiffies + msecs_to_jiffies(15))) { mod_timer(&dev->raw->edge_handle, jiffies + msecs_to_jiffies(15)); } spin_unlock(&dev->raw->edge_spinlock); return rc; } EXPORT_SYMBOL_GPL(ir_raw_event_store_with_timeout); /** * ir_raw_event_store_with_filter() - pass next pulse/space to decoders with some processing * @dev: the struct rc_dev device descriptor * @ev: the event that has occurred * * This routine (which may be called from an interrupt context) works * in similar manner to ir_raw_event_store_edge. * This routine is intended for devices with limited internal buffer * It automerges samples of same type, and handles timeouts. Returns non-zero * if the event was added, and zero if the event was ignored due to idle * processing. */ int ir_raw_event_store_with_filter(struct rc_dev *dev, struct ir_raw_event *ev) { if (!dev->raw) return -EINVAL; /* Ignore spaces in idle mode */ if (dev->idle && !ev->pulse) return 0; else if (dev->idle) ir_raw_event_set_idle(dev, false); if (!dev->raw->this_ev.duration) dev->raw->this_ev = *ev; else if (ev->pulse == dev->raw->this_ev.pulse) dev->raw->this_ev.duration += ev->duration; else { ir_raw_event_store(dev, &dev->raw->this_ev); dev->raw->this_ev = *ev; } /* Enter idle mode if necessary */ if (!ev->pulse && dev->timeout && dev->raw->this_ev.duration >= dev->timeout) ir_raw_event_set_idle(dev, true); return 1; } EXPORT_SYMBOL_GPL(ir_raw_event_store_with_filter); /** * ir_raw_event_set_idle() - provide hint to rc-core when the device is idle or not * @dev: the struct rc_dev device descriptor * @idle: whether the device is idle or not */ void ir_raw_event_set_idle(struct rc_dev *dev, bool idle) { if (!dev->raw) return; dev_dbg(&dev->dev, "%s idle mode\n", idle ? "enter" : "leave"); if (idle) { dev->raw->this_ev.timeout = true; ir_raw_event_store(dev, &dev->raw->this_ev); dev->raw->this_ev = (struct ir_raw_event) {}; } if (dev->s_idle) dev->s_idle(dev, idle); dev->idle = idle; } EXPORT_SYMBOL_GPL(ir_raw_event_set_idle); /** * ir_raw_event_handle() - schedules the decoding of stored ir data * @dev: the struct rc_dev device descriptor * * This routine will tell rc-core to start decoding stored ir data. */ void ir_raw_event_handle(struct rc_dev *dev) { if (!dev->raw || !dev->raw->thread) return; wake_up_process(dev->raw->thread); } EXPORT_SYMBOL_GPL(ir_raw_event_handle); /* used internally by the sysfs interface */ u64 ir_raw_get_allowed_protocols(void) { return atomic64_read(&available_protocols); } static int change_protocol(struct rc_dev *dev, u64 *rc_proto) { struct ir_raw_handler *handler; u32 timeout = 0; mutex_lock(&ir_raw_handler_lock); list_for_each_entry(handler, &ir_raw_handler_list, list) { if (!(dev->enabled_protocols & handler->protocols) && (*rc_proto & handler->protocols) && handler->raw_register) handler->raw_register(dev); if ((dev->enabled_protocols & handler->protocols) && !(*rc_proto & handler->protocols) && handler->raw_unregister) handler->raw_unregister(dev); } mutex_unlock(&ir_raw_handler_lock); if (!dev->max_timeout) return 0; mutex_lock(&ir_raw_handler_lock); list_for_each_entry(handler, &ir_raw_handler_list, list) { if (handler->protocols & *rc_proto) { if (timeout < handler->min_timeout) timeout = handler->min_timeout; } } mutex_unlock(&ir_raw_handler_lock); if (timeout == 0) timeout = IR_DEFAULT_TIMEOUT; else timeout += MS_TO_US(10); if (timeout < dev->min_timeout) timeout = dev->min_timeout; else if (timeout > dev->max_timeout) timeout = dev->max_timeout; if (dev->s_timeout) dev->s_timeout(dev, timeout); else dev->timeout = timeout; return 0; } static void ir_raw_disable_protocols(struct rc_dev *dev, u64 protocols) { mutex_lock(&dev->lock); dev->enabled_protocols &= ~protocols; mutex_unlock(&dev->lock); } /** * ir_raw_gen_manchester() - Encode data with Manchester (bi-phase) modulation. * @ev: Pointer to pointer to next free event. *@ev is incremented for * each raw event filled. * @max: Maximum number of raw events to fill. * @timings: Manchester modulation timings. * @n: Number of bits of data. * @data: Data bits to encode. * * Encodes the @n least significant bits of @data using Manchester (bi-phase) * modulation with the timing characteristics described by @timings, writing up * to @max raw IR events using the *@ev pointer. * * Returns: 0 on success. * -ENOBUFS if there isn't enough space in the array to fit the * full encoded data. In this case all @max events will have been * written. */ int ir_raw_gen_manchester(struct ir_raw_event **ev, unsigned int max, const struct ir_raw_timings_manchester *timings, unsigned int n, u64 data) { bool need_pulse; u64 i; int ret = -ENOBUFS; i = BIT_ULL(n - 1); if (timings->leader_pulse) { if (!max--) return ret; init_ir_raw_event_duration((*ev), 1, timings->leader_pulse); if (timings->leader_space) { if (!max--) return ret; init_ir_raw_event_duration(++(*ev), 0, timings->leader_space); } } else { /* continue existing signal */ --(*ev); } /* from here on *ev will point to the last event rather than the next */ while (n && i > 0) { need_pulse = !(data & i); if (timings->invert) need_pulse = !need_pulse; if (need_pulse == !!(*ev)->pulse) { (*ev)->duration += timings->clock; } else { if (!max--) goto nobufs; init_ir_raw_event_duration(++(*ev), need_pulse, timings->clock); } if (!max--) goto nobufs; init_ir_raw_event_duration(++(*ev), !need_pulse, timings->clock); i >>= 1; } if (timings->trailer_space) { if (!(*ev)->pulse) (*ev)->duration += timings->trailer_space; else if (!max--) goto nobufs; else init_ir_raw_event_duration(++(*ev), 0, timings->trailer_space); } ret = 0; nobufs: /* point to the next event rather than last event before returning */ ++(*ev); return ret; } EXPORT_SYMBOL(ir_raw_gen_manchester); /** * ir_raw_gen_pd() - Encode data to raw events with pulse-distance modulation. * @ev: Pointer to pointer to next free event. *@ev is incremented for * each raw event filled. * @max: Maximum number of raw events to fill. * @timings: Pulse distance modulation timings. * @n: Number of bits of data. * @data: Data bits to encode. * * Encodes the @n least significant bits of @data using pulse-distance * modulation with the timing characteristics described by @timings, writing up * to @max raw IR events using the *@ev pointer. * * Returns: 0 on success. * -ENOBUFS if there isn't enough space in the array to fit the * full encoded data. In this case all @max events will have been * written. */ int ir_raw_gen_pd(struct ir_raw_event **ev, unsigned int max, const struct ir_raw_timings_pd *timings, unsigned int n, u64 data) { int i; int ret; unsigned int space; if (timings->header_pulse) { ret = ir_raw_gen_pulse_space(ev, &max, timings->header_pulse, timings->header_space); if (ret) return ret; } if (timings->msb_first) { for (i = n - 1; i >= 0; --i) { space = timings->bit_space[(data >> i) & 1]; ret = ir_raw_gen_pulse_space(ev, &max, timings->bit_pulse, space); if (ret) return ret; } } else { for (i = 0; i < n; ++i, data >>= 1) { space = timings->bit_space[data & 1]; ret = ir_raw_gen_pulse_space(ev, &max, timings->bit_pulse, space); if (ret) return ret; } } ret = ir_raw_gen_pulse_space(ev, &max, timings->trailer_pulse, timings->trailer_space); return ret; } EXPORT_SYMBOL(ir_raw_gen_pd); /** * ir_raw_gen_pl() - Encode data to raw events with pulse-length modulation. * @ev: Pointer to pointer to next free event. *@ev is incremented for * each raw event filled. * @max: Maximum number of raw events to fill. * @timings: Pulse distance modulation timings. * @n: Number of bits of data. * @data: Data bits to encode. * * Encodes the @n least significant bits of @data using space-distance * modulation with the timing characteristics described by @timings, writing up * to @max raw IR events using the *@ev pointer. * * Returns: 0 on success. * -ENOBUFS if there isn't enough space in the array to fit the * full encoded data. In this case all @max events will have been * written. */ int ir_raw_gen_pl(struct ir_raw_event **ev, unsigned int max, const struct ir_raw_timings_pl *timings, unsigned int n, u64 data) { int i; int ret = -ENOBUFS; unsigned int pulse; if (!max--) return ret; init_ir_raw_event_duration((*ev)++, 1, timings->header_pulse); if (timings->msb_first) { for (i = n - 1; i >= 0; --i) { if (!max--) return ret; init_ir_raw_event_duration((*ev)++, 0, timings->bit_space); if (!max--) return ret; pulse = timings->bit_pulse[(data >> i) & 1]; init_ir_raw_event_duration((*ev)++, 1, pulse); } } else { for (i = 0; i < n; ++i, data >>= 1) { if (!max--) return ret; init_ir_raw_event_duration((*ev)++, 0, timings->bit_space); if (!max--) return ret; pulse = timings->bit_pulse[data & 1]; init_ir_raw_event_duration((*ev)++, 1, pulse); } } if (!max--) return ret; init_ir_raw_event_duration((*ev)++, 0, timings->trailer_space); return 0; } EXPORT_SYMBOL(ir_raw_gen_pl); /** * ir_raw_encode_scancode() - Encode a scancode as raw events * * @protocol: protocol * @scancode: scancode filter describing a single scancode * @events: array of raw events to write into * @max: max number of raw events * * Attempts to encode the scancode as raw events. * * Returns: The number of events written. * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. * -EINVAL if the scancode is ambiguous or invalid, or if no * compatible encoder was found. */ int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_handler *handler; int ret = -EINVAL; u64 mask = 1ULL << protocol; ir_raw_load_modules(&mask); mutex_lock(&ir_raw_handler_lock); list_for_each_entry(handler, &ir_raw_handler_list, list) { if (handler->protocols & mask && handler->encode) { ret = handler->encode(protocol, scancode, events, max); if (ret >= 0 || ret == -ENOBUFS) break; } } mutex_unlock(&ir_raw_handler_lock); return ret; } EXPORT_SYMBOL(ir_raw_encode_scancode); /** * ir_raw_edge_handle() - Handle ir_raw_event_store_edge() processing * * @t: timer_list * * This callback is armed by ir_raw_event_store_edge(). It does two things: * first of all, rather than calling ir_raw_event_handle() for each * edge and waking up the rc thread, 15 ms after the first edge * ir_raw_event_handle() is called. Secondly, generate a timeout event * no more IR is received after the rc_dev timeout. */ static void ir_raw_edge_handle(struct timer_list *t) { struct ir_raw_event_ctrl *raw = timer_container_of(raw, t, edge_handle); struct rc_dev *dev = raw->dev; unsigned long flags; ktime_t interval; spin_lock_irqsave(&dev->raw->edge_spinlock, flags); interval = ktime_sub(ktime_get(), dev->raw->last_event); if (ktime_to_us(interval) >= dev->timeout) { struct ir_raw_event ev = { .timeout = true, .duration = ktime_to_us(interval) }; ir_raw_event_store(dev, &ev); } else { mod_timer(&dev->raw->edge_handle, jiffies + usecs_to_jiffies(dev->timeout - ktime_to_us(interval))); } spin_unlock_irqrestore(&dev->raw->edge_spinlock, flags); ir_raw_event_handle(dev); } /** * ir_raw_encode_carrier() - Get carrier used for protocol * * @protocol: protocol * * Attempts to find the carrier for the specified protocol * * Returns: The carrier in Hz * -EINVAL if the protocol is invalid, or if no * compatible encoder was found. */ int ir_raw_encode_carrier(enum rc_proto protocol) { struct ir_raw_handler *handler; int ret = -EINVAL; u64 mask = BIT_ULL(protocol); mutex_lock(&ir_raw_handler_lock); list_for_each_entry(handler, &ir_raw_handler_list, list) { if (handler->protocols & mask && handler->encode) { ret = handler->carrier; break; } } mutex_unlock(&ir_raw_handler_lock); return ret; } EXPORT_SYMBOL(ir_raw_encode_carrier); /* * Used to (un)register raw event clients */ int ir_raw_event_prepare(struct rc_dev *dev) { if (!dev) return -EINVAL; dev->raw = kzalloc(sizeof(*dev->raw), GFP_KERNEL); if (!dev->raw) return -ENOMEM; dev->raw->dev = dev; dev->change_protocol = change_protocol; dev->idle = true; spin_lock_init(&dev->raw->edge_spinlock); timer_setup(&dev->raw->edge_handle, ir_raw_edge_handle, 0); INIT_KFIFO(dev->raw->kfifo); return 0; } int ir_raw_event_register(struct rc_dev *dev) { struct task_struct *thread; thread = kthread_run(ir_raw_event_thread, dev->raw, "rc%u", dev->minor); if (IS_ERR(thread)) return PTR_ERR(thread); dev->raw->thread = thread; mutex_lock(&ir_raw_handler_lock); list_add_tail(&dev->raw->list, &ir_raw_client_list); mutex_unlock(&ir_raw_handler_lock); return 0; } void ir_raw_event_free(struct rc_dev *dev) { if (!dev) return; kfree(dev->raw); dev->raw = NULL; } void ir_raw_event_unregister(struct rc_dev *dev) { struct ir_raw_handler *handler; if (!dev || !dev->raw) return; kthread_stop(dev->raw->thread); timer_delete_sync(&dev->raw->edge_handle); mutex_lock(&ir_raw_handler_lock); list_del(&dev->raw->list); list_for_each_entry(handler, &ir_raw_handler_list, list) if (handler->raw_unregister && (handler->protocols & dev->enabled_protocols)) handler->raw_unregister(dev); lirc_bpf_free(dev); ir_raw_event_free(dev); /* * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so * ensure that the raw member is null on unlock; this is how * "device gone" is checked. */ mutex_unlock(&ir_raw_handler_lock); } /* * Extension interface - used to register the IR decoders */ int ir_raw_handler_register(struct ir_raw_handler *ir_raw_handler) { mutex_lock(&ir_raw_handler_lock); list_add_tail(&ir_raw_handler->list, &ir_raw_handler_list); atomic64_or(ir_raw_handler->protocols, &available_protocols); mutex_unlock(&ir_raw_handler_lock); return 0; } EXPORT_SYMBOL(ir_raw_handler_register); void ir_raw_handler_unregister(struct ir_raw_handler *ir_raw_handler) { struct ir_raw_event_ctrl *raw; u64 protocols = ir_raw_handler->protocols; mutex_lock(&ir_raw_handler_lock); list_del(&ir_raw_handler->list); list_for_each_entry(raw, &ir_raw_client_list, list) { if (ir_raw_handler->raw_unregister && (raw->dev->enabled_protocols & protocols)) ir_raw_handler->raw_unregister(raw->dev); ir_raw_disable_protocols(raw->dev, protocols); } atomic64_andnot(protocols, &available_protocols); mutex_unlock(&ir_raw_handler_lock); } EXPORT_SYMBOL(ir_raw_handler_unregister);
959 175 73 494 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ #ifndef _ASM_RISCV_ATOMIC_H #define _ASM_RISCV_ATOMIC_H #ifdef CONFIG_GENERIC_ATOMIC64 # include <asm-generic/atomic64.h> #else # if (__riscv_xlen < 64) # error "64-bit atomics require XLEN to be at least 64" # endif #endif #include <asm/cmpxchg.h> #define __atomic_acquire_fence() \ __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory") #define __atomic_release_fence() \ __asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory"); static __always_inline int arch_atomic_read(const atomic_t *v) { return READ_ONCE(v->counter); } static __always_inline void arch_atomic_set(atomic_t *v, int i) { WRITE_ONCE(v->counter, i); } #ifndef CONFIG_GENERIC_ATOMIC64 #define ATOMIC64_INIT(i) { (i) } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return READ_ONCE(v->counter); } static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { WRITE_ONCE(v->counter, i); } #endif /* * First, the atomic ops that have no ordering constraints and therefor don't * have the AQ or RL bits set. These don't return anything, so there's only * one version to worry about. */ #define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix) \ static __always_inline \ void arch_atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \ { \ __asm__ __volatile__ ( \ " amo" #asm_op "." #asm_type " zero, %1, %0" \ : "+A" (v->counter) \ : "r" (I) \ : "memory"); \ } \ #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, I) \ ATOMIC_OP (op, asm_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, I) \ ATOMIC_OP (op, asm_op, I, w, int, ) \ ATOMIC_OP (op, asm_op, I, d, s64, 64) #endif ATOMIC_OPS(add, add, i) ATOMIC_OPS(sub, add, -i) ATOMIC_OPS(and, and, i) ATOMIC_OPS( or, or, i) ATOMIC_OPS(xor, xor, i) #undef ATOMIC_OP #undef ATOMIC_OPS /* * Atomic ops that have ordered, relaxed, acquire, and release variants. * There's two flavors of these: the arithmatic ops have both fetch and return * versions, while the logical ops only have fetch versions. */ #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ static __always_inline \ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i, \ atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ " amo" #asm_op "." #asm_type " %1, %2, %0" \ : "+A" (v->counter), "=r" (ret) \ : "r" (I) \ : "memory"); \ return ret; \ } \ static __always_inline \ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ { \ register c_type ret; \ __asm__ __volatile__ ( \ " amo" #asm_op "." #asm_type ".aqrl %1, %2, %0" \ : "+A" (v->counter), "=r" (ret) \ : "r" (I) \ : "memory"); \ return ret; \ } #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ static __always_inline \ c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i, \ atomic##prefix##_t *v) \ { \ return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ } \ static __always_inline \ c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ { \ return arch_atomic##prefix##_fetch_##op(i, v) c_op I; \ } #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, c_op, I) \ ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, c_op, I) \ ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) \ ATOMIC_FETCH_OP( op, asm_op, I, d, s64, 64) \ ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, s64, 64) #endif ATOMIC_OPS(add, add, +, i) ATOMIC_OPS(sub, add, +, -i) #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed #define arch_atomic_add_return arch_atomic_add_return #define arch_atomic_sub_return arch_atomic_sub_return #define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed #define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed #define arch_atomic_fetch_add arch_atomic_fetch_add #define arch_atomic_fetch_sub arch_atomic_fetch_sub #ifndef CONFIG_GENERIC_ATOMIC64 #define arch_atomic64_add_return_relaxed arch_atomic64_add_return_relaxed #define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return_relaxed #define arch_atomic64_add_return arch_atomic64_add_return #define arch_atomic64_sub_return arch_atomic64_sub_return #define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add_relaxed #define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub_relaxed #define arch_atomic64_fetch_add arch_atomic64_fetch_add #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub #endif #undef ATOMIC_OPS #ifdef CONFIG_GENERIC_ATOMIC64 #define ATOMIC_OPS(op, asm_op, I) \ ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) #else #define ATOMIC_OPS(op, asm_op, I) \ ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) \ ATOMIC_FETCH_OP(op, asm_op, I, d, s64, 64) #endif ATOMIC_OPS(and, and, i) ATOMIC_OPS( or, or, i) ATOMIC_OPS(xor, xor, i) #define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed #define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed #define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed #define arch_atomic_fetch_and arch_atomic_fetch_and #define arch_atomic_fetch_or arch_atomic_fetch_or #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifndef CONFIG_GENERIC_ATOMIC64 #define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and_relaxed #define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or_relaxed #define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor_relaxed #define arch_atomic64_fetch_and arch_atomic64_fetch_and #define arch_atomic64_fetch_or arch_atomic64_fetch_or #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #endif #undef ATOMIC_OPS #undef ATOMIC_FETCH_OP #undef ATOMIC_OP_RETURN #define _arch_atomic_fetch_add_unless(_prev, _rc, counter, _a, _u, sfx) \ ({ \ __asm__ __volatile__ ( \ "0: lr." sfx " %[p], %[c]\n" \ " beq %[p], %[u], 1f\n" \ " add %[rc], %[p], %[a]\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ " fence rw, rw\n" \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : [a]"r" (_a), [u]"r" (_u) \ : "memory"); \ }) /* This is required to provide a full barrier on success. */ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) { int prev, rc; _arch_atomic_fetch_add_unless(prev, rc, v->counter, a, u, "w"); return prev; } #define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless #ifndef CONFIG_GENERIC_ATOMIC64 static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { s64 prev; long rc; _arch_atomic_fetch_add_unless(prev, rc, v->counter, a, u, "d"); return prev; } #define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless #endif #define _arch_atomic_inc_unless_negative(_prev, _rc, counter, sfx) \ ({ \ __asm__ __volatile__ ( \ "0: lr." sfx " %[p], %[c]\n" \ " bltz %[p], 1f\n" \ " addi %[rc], %[p], 1\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ " fence rw, rw\n" \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ : "memory"); \ }) static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) { int prev, rc; _arch_atomic_inc_unless_negative(prev, rc, v->counter, "w"); return !(prev < 0); } #define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative #define _arch_atomic_dec_unless_positive(_prev, _rc, counter, sfx) \ ({ \ __asm__ __volatile__ ( \ "0: lr." sfx " %[p], %[c]\n" \ " bgtz %[p], 1f\n" \ " addi %[rc], %[p], -1\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ " fence rw, rw\n" \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ : "memory"); \ }) static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) { int prev, rc; _arch_atomic_dec_unless_positive(prev, rc, v->counter, "w"); return !(prev > 0); } #define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive #define _arch_atomic_dec_if_positive(_prev, _rc, counter, sfx) \ ({ \ __asm__ __volatile__ ( \ "0: lr." sfx " %[p], %[c]\n" \ " addi %[rc], %[p], -1\n" \ " bltz %[rc], 1f\n" \ " sc." sfx ".rl %[rc], %[rc], %[c]\n" \ " bnez %[rc], 0b\n" \ " fence rw, rw\n" \ "1:\n" \ : [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter) \ : \ : "memory"); \ }) static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) { int prev, rc; _arch_atomic_dec_if_positive(prev, rc, v->counter, "w"); return prev - 1; } #define arch_atomic_dec_if_positive arch_atomic_dec_if_positive #ifndef CONFIG_GENERIC_ATOMIC64 static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) { s64 prev; long rc; _arch_atomic_inc_unless_negative(prev, rc, v->counter, "d"); return !(prev < 0); } #define arch_atomic64_inc_unless_negative arch_atomic64_inc_unless_negative static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) { s64 prev; long rc; _arch_atomic_dec_unless_positive(prev, rc, v->counter, "d"); return !(prev > 0); } #define arch_atomic64_dec_unless_positive arch_atomic64_dec_unless_positive static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 prev; long rc; _arch_atomic_dec_if_positive(prev, rc, v->counter, "d"); return prev - 1; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive #endif #endif /* _ASM_RISCV_ATOMIC_H */
9 9 9 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 4 4 4 4 4 4 3 2 2 3 2 2 25 24 25 25 24 25 9 9 8 9 9 9 9 10 10 10 10 10 10 10 10 10 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2 2 25 23 24 25 25 15 15 10 10 10 10 10 8 8 1 8 8 8 8 10 25 25 25 24 25 24 16 16 16 25 25 25 24 9 25 23 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/proc_fs.h> #include <linux/mutex.h> int snd_info_check_reserved_words(const char *str) { static const char * const reserved[] = { "version", "meminfo", "memdebug", "detect", "devices", "oss", "cards", "timers", "synth", "pcm", "seq", NULL }; const char * const *xstr = reserved; while (*xstr) { if (!strcmp(*xstr, str)) return 0; xstr++; } if (!strncmp(str, "card", 4)) return 0; return 1; } static DEFINE_MUTEX(info_mutex); struct snd_info_private_data { struct snd_info_buffer *rbuffer; struct snd_info_buffer *wbuffer; struct snd_info_entry *entry; void *file_private_data; }; static int snd_info_version_init(void); static void snd_info_clear_entries(struct snd_info_entry *entry); /* */ static struct snd_info_entry *snd_proc_root; struct snd_info_entry *snd_seq_root; EXPORT_SYMBOL(snd_seq_root); #ifdef CONFIG_SND_OSSEMUL struct snd_info_entry *snd_oss_root; #endif static int alloc_info_private(struct snd_info_entry *entry, struct snd_info_private_data **ret) { struct snd_info_private_data *data; if (!entry || !entry->p) return -ENODEV; if (!try_module_get(entry->module)) return -EFAULT; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) { module_put(entry->module); return -ENOMEM; } data->entry = entry; *ret = data; return 0; } static bool valid_pos(loff_t pos, size_t count) { if (pos < 0 || (long) pos != pos || (ssize_t) count < 0) return false; if ((unsigned long) pos + (unsigned long) count < (unsigned long) pos) return false; return true; } /* * file ops for binary proc files */ static loff_t snd_info_entry_llseek(struct file *file, loff_t offset, int orig) { struct snd_info_private_data *data; struct snd_info_entry *entry; loff_t size; data = file->private_data; entry = data->entry; guard(mutex)(&entry->access); if (entry->c.ops->llseek) return entry->c.ops->llseek(entry, data->file_private_data, file, offset, orig); size = entry->size; switch (orig) { case SEEK_SET: break; case SEEK_CUR: offset += file->f_pos; break; case SEEK_END: if (!size) return -EINVAL; offset += size; break; default: return -EINVAL; } if (offset < 0) return -EINVAL; if (size && offset > size) offset = size; file->f_pos = offset; return offset; } static ssize_t snd_info_entry_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; size_t size; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (pos >= entry->size) return 0; size = entry->size - pos; size = min(count, size); size = entry->c.ops->read(entry, data->file_private_data, file, buffer, size, pos); if ((ssize_t) size > 0) *offset = pos + size; return size; } static ssize_t snd_info_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; ssize_t size = 0; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (count > 0) { size_t maxsize = entry->size - pos; count = min(count, maxsize); size = entry->c.ops->write(entry, data->file_private_data, file, buffer, count, pos); } if (size > 0) *offset = pos + size; return size; } static __poll_t snd_info_entry_poll(struct file *file, poll_table *wait) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; __poll_t mask = 0; if (entry->c.ops->poll) return entry->c.ops->poll(entry, data->file_private_data, file, wait); if (entry->c.ops->read) mask |= EPOLLIN | EPOLLRDNORM; if (entry->c.ops->write) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } static long snd_info_entry_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (!entry->c.ops->ioctl) return -ENOTTY; return entry->c.ops->ioctl(entry, data->file_private_data, file, cmd, arg); } static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct snd_info_private_data *data; struct snd_info_entry *entry; data = file->private_data; if (data == NULL) return 0; entry = data->entry; if (!entry->c.ops->mmap) return -ENXIO; return entry->c.ops->mmap(entry, data->file_private_data, inode, file, vma); } static int snd_info_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; mode = file->f_flags & O_ACCMODE; if (((mode == O_RDONLY || mode == O_RDWR) && !entry->c.ops->read) || ((mode == O_WRONLY || mode == O_RDWR) && !entry->c.ops->write)) { err = -ENODEV; goto error; } if (entry->c.ops->open) { err = entry->c.ops->open(entry, mode, &data->file_private_data); if (err < 0) goto error; } file->private_data = data; return 0; error: kfree(data); module_put(entry->module); return err; } static int snd_info_entry_release(struct inode *inode, struct file *file) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (entry->c.ops->release) entry->c.ops->release(entry, file->f_flags & O_ACCMODE, data->file_private_data); module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_entry_operations = { .proc_lseek = snd_info_entry_llseek, .proc_read = snd_info_entry_read, .proc_write = snd_info_entry_write, .proc_poll = snd_info_entry_poll, .proc_ioctl = snd_info_entry_ioctl, .proc_mmap = snd_info_entry_mmap, .proc_open = snd_info_entry_open, .proc_release = snd_info_entry_release, }; /* * file ops for text proc files */ static ssize_t snd_info_text_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *offset) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; struct snd_info_buffer *buf; loff_t pos; size_t next; if (!entry->c.text.write) return -EIO; pos = *offset; if (!valid_pos(pos, count)) return -EIO; next = pos + count; /* don't handle too large text inputs */ if (next > 16 * 1024) return -EIO; guard(mutex)(&entry->access); buf = data->wbuffer; if (!buf) { data->wbuffer = buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; } if (next > buf->len) { char *nbuf = kvzalloc(PAGE_ALIGN(next), GFP_KERNEL); if (!nbuf) return -ENOMEM; kvfree(buf->buffer); buf->buffer = nbuf; buf->len = PAGE_ALIGN(next); } if (copy_from_user(buf->buffer + pos, buffer, count)) return -EFAULT; buf->size = next; *offset = next; return count; } static int snd_info_seq_show(struct seq_file *seq, void *p) { struct snd_info_private_data *data = seq->private; struct snd_info_entry *entry = data->entry; if (!entry->c.text.read) { return -EIO; } else { data->rbuffer->buffer = (char *)seq; /* XXX hack! */ entry->c.text.read(entry, data->rbuffer); } return 0; } static int snd_info_text_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; data->rbuffer = kzalloc(sizeof(*data->rbuffer), GFP_KERNEL); if (!data->rbuffer) { err = -ENOMEM; goto error; } if (entry->size) err = single_open_size(file, snd_info_seq_show, data, entry->size); else err = single_open(file, snd_info_seq_show, data); if (err < 0) goto error; return 0; error: kfree(data->rbuffer); kfree(data); module_put(entry->module); return err; } static int snd_info_text_entry_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; if (data->wbuffer && entry->c.text.write) entry->c.text.write(entry, data->wbuffer); single_release(inode, file); kfree(data->rbuffer); if (data->wbuffer) { kvfree(data->wbuffer->buffer); kfree(data->wbuffer); } module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_text_entry_ops = { .proc_open = snd_info_text_entry_open, .proc_release = snd_info_text_entry_release, .proc_write = snd_info_text_entry_write, .proc_lseek = seq_lseek, .proc_read = seq_read, }; static struct snd_info_entry *create_subdir(struct module *mod, const char *name) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(mod, name, NULL); if (!entry) return NULL; entry->mode = S_IFDIR | 0555; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); return NULL; } return entry; } static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module); int __init snd_info_init(void) { snd_proc_root = snd_info_create_entry("asound", NULL, THIS_MODULE); if (!snd_proc_root) return -ENOMEM; snd_proc_root->mode = S_IFDIR | 0555; snd_proc_root->p = proc_mkdir("asound", NULL); if (!snd_proc_root->p) goto error; #ifdef CONFIG_SND_OSSEMUL snd_oss_root = create_subdir(THIS_MODULE, "oss"); if (!snd_oss_root) goto error; #endif #if IS_ENABLED(CONFIG_SND_SEQUENCER) snd_seq_root = create_subdir(THIS_MODULE, "seq"); if (!snd_seq_root) goto error; #endif if (snd_info_version_init() < 0 || snd_minor_info_init() < 0 || snd_minor_info_oss_init() < 0 || snd_card_info_init() < 0 || snd_info_minor_register() < 0) goto error; return 0; error: snd_info_free_entry(snd_proc_root); return -ENOMEM; } int __exit snd_info_done(void) { snd_info_free_entry(snd_proc_root); return 0; } static void snd_card_id_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_card *card = entry->private_data; snd_iprintf(buffer, "%s\n", card->id); } /* * create a card proc file * called from init.c */ int snd_info_card_create(struct snd_card *card) { char str[8]; struct snd_info_entry *entry; if (snd_BUG_ON(!card)) return -ENXIO; sprintf(str, "card%i", card->number); entry = create_subdir(card->module, str); if (!entry) return -ENOMEM; card->proc_root = entry; return snd_card_ro_proc_new(card, "id", card, snd_card_id_read); } /* * register the card proc file * called from init.c * can be called multiple times for reinitialization */ int snd_info_card_register(struct snd_card *card) { struct proc_dir_entry *p; int err; if (snd_BUG_ON(!card)) return -ENXIO; err = snd_info_register(card->proc_root); if (err < 0) return err; if (!strcmp(card->id, card->proc_root->name)) return 0; if (card->proc_root_link) return 0; p = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); if (!p) return -ENOMEM; card->proc_root_link = p; return 0; } /* * called on card->id change */ void snd_info_card_id_change(struct snd_card *card) { guard(mutex)(&info_mutex); if (card->proc_root_link) { proc_remove(card->proc_root_link); card->proc_root_link = NULL; } if (strcmp(card->id, card->proc_root->name)) card->proc_root_link = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); } /* * de-register the card proc file * called from init.c */ void snd_info_card_disconnect(struct snd_card *card) { if (!card) return; proc_remove(card->proc_root_link); if (card->proc_root) proc_remove(card->proc_root->p); guard(mutex)(&info_mutex); if (card->proc_root) snd_info_clear_entries(card->proc_root); card->proc_root_link = NULL; card->proc_root = NULL; } /* * release the card proc file resources * called from init.c */ int snd_info_card_free(struct snd_card *card) { if (!card) return 0; snd_info_free_entry(card->proc_root); card->proc_root = NULL; return 0; } /** * snd_info_get_line - read one line from the procfs buffer * @buffer: the procfs buffer * @line: the buffer to store * @len: the max. buffer size * * Reads one line from the buffer and stores the string. * * Return: Zero if successful, or 1 if error or EOF. */ int snd_info_get_line(struct snd_info_buffer *buffer, char *line, int len) { int c; if (snd_BUG_ON(!buffer)) return 1; if (!buffer->buffer) return 1; if (len <= 0 || buffer->stop || buffer->error) return 1; while (!buffer->stop) { c = buffer->buffer[buffer->curr++]; if (buffer->curr >= buffer->size) buffer->stop = 1; if (c == '\n') break; if (len > 1) { len--; *line++ = c; } } *line = '\0'; return 0; } EXPORT_SYMBOL(snd_info_get_line); /** * snd_info_get_str - parse a string token * @dest: the buffer to store the string token * @src: the original string * @len: the max. length of token - 1 * * Parses the original string and copy a token to the given * string buffer. * * Return: The updated pointer of the original string so that * it can be used for the next call. */ const char *snd_info_get_str(char *dest, const char *src, int len) { int c; while (*src == ' ' || *src == '\t') src++; if (*src == '"' || *src == '\'') { c = *src++; while (--len > 0 && *src && *src != c) { *dest++ = *src++; } if (*src == c) src++; } else { while (--len > 0 && *src && *src != ' ' && *src != '\t') { *dest++ = *src++; } } *dest = 0; while (*src == ' ' || *src == '\t') src++; return src; } EXPORT_SYMBOL(snd_info_get_str); /* * snd_info_create_entry - create an info entry * @name: the proc file name * @parent: the parent directory * * Creates an info entry with the given file name and initializes as * the default state. * * Usually called from other functions such as * snd_info_create_card_entry(). * * Return: The pointer of the new instance, or %NULL on failure. */ static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module) { struct snd_info_entry *entry; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return NULL; entry->name = kstrdup(name, GFP_KERNEL); if (entry->name == NULL) { kfree(entry); return NULL; } entry->mode = S_IFREG | 0444; entry->content = SNDRV_INFO_CONTENT_TEXT; mutex_init(&entry->access); INIT_LIST_HEAD(&entry->children); INIT_LIST_HEAD(&entry->list); entry->parent = parent; entry->module = module; if (parent) { guard(mutex)(&parent->access); list_add_tail(&entry->list, &parent->children); } return entry; } /** * snd_info_create_module_entry - create an info entry for the given module * @module: the module pointer * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given module. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_module_entry(struct module * module, const char *name, struct snd_info_entry *parent) { if (!parent) parent = snd_proc_root; return snd_info_create_entry(name, parent, module); } EXPORT_SYMBOL(snd_info_create_module_entry); /** * snd_info_create_card_entry - create an info entry for the given card * @card: the card instance * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given card. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_card_entry(struct snd_card *card, const char *name, struct snd_info_entry * parent) { if (!parent) parent = card->proc_root; return snd_info_create_entry(name, parent, card->module); } EXPORT_SYMBOL(snd_info_create_card_entry); static void snd_info_clear_entries(struct snd_info_entry *entry) { struct snd_info_entry *p; if (!entry->p) return; list_for_each_entry(p, &entry->children, list) snd_info_clear_entries(p); entry->p = NULL; } /** * snd_info_free_entry - release the info entry * @entry: the info entry * * Releases the info entry. */ void snd_info_free_entry(struct snd_info_entry * entry) { struct snd_info_entry *p, *n; if (!entry) return; if (entry->p) { proc_remove(entry->p); guard(mutex)(&info_mutex); snd_info_clear_entries(entry); } /* free all children at first */ list_for_each_entry_safe(p, n, &entry->children, list) snd_info_free_entry(p); p = entry->parent; if (p) { guard(mutex)(&p->access); list_del(&entry->list); } kfree(entry->name); if (entry->private_free) entry->private_free(entry); kfree(entry); } EXPORT_SYMBOL(snd_info_free_entry); static int __snd_info_register(struct snd_info_entry *entry) { struct proc_dir_entry *root, *p = NULL; if (snd_BUG_ON(!entry)) return -ENXIO; root = entry->parent == NULL ? snd_proc_root->p : entry->parent->p; guard(mutex)(&info_mutex); if (entry->p || !root) return 0; if (S_ISDIR(entry->mode)) { p = proc_mkdir_mode(entry->name, entry->mode, root); if (!p) return -ENOMEM; } else { const struct proc_ops *ops; if (entry->content == SNDRV_INFO_CONTENT_DATA) ops = &snd_info_entry_operations; else ops = &snd_info_text_entry_ops; p = proc_create_data(entry->name, entry->mode, root, ops, entry); if (!p) return -ENOMEM; proc_set_size(p, entry->size); } entry->p = p; return 0; } /** * snd_info_register - register the info entry * @entry: the info entry * * Registers the proc info entry. * The all children entries are registered recursively. * * Return: Zero if successful, or a negative error code on failure. */ int snd_info_register(struct snd_info_entry *entry) { struct snd_info_entry *p; int err; if (!entry->p) { err = __snd_info_register(entry); if (err < 0) return err; } list_for_each_entry(p, &entry->children, list) { err = snd_info_register(p); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(snd_info_register); /** * snd_card_rw_proc_new - Create a read/write text proc file entry for the card * @card: the card instance * @name: the file name * @private_data: the arbitrary private data * @read: the read callback * @write: the write callback, NULL for read-only * * This proc file entry will be registered via snd_card_register() call, and * it will be removed automatically at the card removal, too. * * Return: zero if successful, or a negative error code */ int snd_card_rw_proc_new(struct snd_card *card, const char *name, void *private_data, void (*read)(struct snd_info_entry *, struct snd_info_buffer *), void (*write)(struct snd_info_entry *entry, struct snd_info_buffer *buffer)) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(card, name, card->proc_root); if (!entry) return -ENOMEM; snd_info_set_text_ops(entry, private_data, read); if (write) { entry->mode |= 0200; entry->c.text.write = write; } return 0; } EXPORT_SYMBOL_GPL(snd_card_rw_proc_new); /* */ static void snd_info_version_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Advanced Linux Sound Architecture Driver Version k%s.\n", init_utsname()->release); } static int __init snd_info_version_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "version", NULL); if (entry == NULL) return -ENOMEM; entry->c.text.read = snd_info_version_read; return snd_info_register(entry); /* freed in error path */ }
51 268 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_DST_OPS_H #define _NET_DST_OPS_H #include <linux/types.h> #include <linux/percpu_counter.h> #include <linux/cache.h> struct dst_entry; struct kmem_cachep; struct net_device; struct sk_buff; struct sock; struct net; struct dst_ops { unsigned short family; unsigned int gc_thresh; void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev); void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void (*redirect)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); int (*local_out)(struct net *net, struct sock *sk, struct sk_buff *skb); struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); void (*confirm_neigh)(const struct dst_entry *dst, const void *daddr); struct kmem_cache *kmem_cachep; struct percpu_counter pcpuc_entries ____cacheline_aligned_in_smp; }; static inline int dst_entries_get_fast(struct dst_ops *dst) { return percpu_counter_read_positive(&dst->pcpuc_entries); } static inline int dst_entries_get_slow(struct dst_ops *dst) { return percpu_counter_sum_positive(&dst->pcpuc_entries); } #define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { percpu_counter_add_batch(&dst->pcpuc_entries, val, DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) { return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) { percpu_counter_destroy(&dst->pcpuc_entries); } #endif
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else if (unlikely(next_offset == size && de->name_len == 1 && de->name[0] == '.')) error_msg = "'.' directory cannot be the last in data block"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk << inode->i_blkbits >> PAGE_SHIFT; if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_mapping, &file->f_ra, file, index, 1 << EXT4_SB(sb)->s_min_folio_order); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); u32 hash; u32 minor_hash; if (IS_CASEFOLDED(inode)) { hash = EXT4_DIRENT_HASH(de); minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hash = 0; minor_hash = 0; } /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, hash, minor_hash, &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. */ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); info->cookie = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[] __counted_by(name_len); }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p = filp->private_data; if (is_dx_dir(file_inode(filp)) && !p->initialized) { p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); p->initialized = true; } } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } static int ext4_dir_open(struct inode *inode, struct file *file) { struct dir_private_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; file->private_data = info; return 0; } const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, };
26 25 25 24 23 22 4 19 25 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 // SPDX-License-Identifier: GPL-2.0-only /* String matching match for iptables * * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_string.h> #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); MODULE_ALIAS("ebt_string"); static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_string_info *conf = par->matchinfo; bool invert; invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, conf->to_offset, conf->config) != UINT_MAX) ^ invert; } #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) static int string_mt_check(const struct xt_mtchk_param *par) { struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; int flags = TS_AUTOLOAD; /* Damn, can't handle this case properly with iptables... */ if (conf->from_offset > conf->to_offset) return -EINVAL; if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') return -EINVAL; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) return -EINVAL; if (conf->u.v1.flags & ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) return -EINVAL; if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) flags |= TS_IGNORECASE; ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, GFP_KERNEL, flags); if (IS_ERR(ts_conf)) return PTR_ERR(ts_conf); conf->config = ts_conf; return 0; } static void string_mt_destroy(const struct xt_mtdtor_param *par) { textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } static struct xt_match xt_string_mt_reg __read_mostly = { .name = "string", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), .usersize = offsetof(struct xt_string_info, config), .me = THIS_MODULE, }; static int __init string_mt_init(void) { return xt_register_match(&xt_string_mt_reg); } static void __exit string_mt_exit(void) { xt_unregister_match(&xt_string_mt_reg); } module_init(string_mt_init); module_exit(string_mt_exit);
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 // SPDX-License-Identifier: GPL-2.0-only /* * vsock sock_diag(7) module * * Copyright (C) 2017 Red Hat, Inc. * Author: Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/module.h> #include <linux/sock_diag.h> #include <linux/vm_sockets_diag.h> #include <net/af_vsock.h> static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u32 flags) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_diag_msg *rep; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->vdiag_family = AF_VSOCK; /* Lock order dictates that sk_lock is acquired before * vsock_table_lock, so we cannot lock here. Simply don't take * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is * held. */ rep->vdiag_type = sk->sk_type; rep->vdiag_state = sk->sk_state; rep->vdiag_shutdown = sk->sk_shutdown; rep->vdiag_src_cid = vsk->local_addr.svm_cid; rep->vdiag_src_port = vsk->local_addr.svm_port; rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; rep->vdiag_dst_port = vsk->remote_addr.svm_port; rep->vdiag_ino = sock_i_ino(sk); sock_diag_save_cookie(sk, rep->vdiag_cookie); return 0; } static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct vsock_diag_req *req; struct vsock_sock *vsk; unsigned int bucket; unsigned int last_i; unsigned int table; struct net *net; unsigned int i; req = nlmsg_data(cb->nlh); net = sock_net(skb->sk); /* State saved between calls: */ table = cb->args[0]; bucket = cb->args[1]; i = last_i = cb->args[2]; /* TODO VMCI pending sockets? */ spin_lock_bh(&vsock_table_lock); /* Bind table (locally created sockets) */ if (table == 0) { while (bucket < ARRAY_SIZE(vsock_bind_table)) { struct list_head *head = &vsock_bind_table[bucket]; i = 0; list_for_each_entry(vsk, head, bound_table) { struct sock *sk = sk_vsock(vsk); if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_bind; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_bind; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_bind: i++; } last_i = 0; bucket++; } table++; bucket = 0; } /* Connected table (accepted connections) */ while (bucket < ARRAY_SIZE(vsock_connected_table)) { struct list_head *head = &vsock_connected_table[bucket]; i = 0; list_for_each_entry(vsk, head, connected_table) { struct sock *sk = sk_vsock(vsk); /* Skip sockets we've already seen above */ if (__vsock_in_bound_table(vsk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_connected; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_connected; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_connected: i++; } last_i = 0; bucket++; } done: spin_unlock_bh(&vsock_table_lock); cb->args[0] = table; cb->args[1] = bucket; cb->args[2] = i; return skb->len; } static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct vsock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = vsock_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return -EOPNOTSUPP; } static const struct sock_diag_handler vsock_diag_handler = { .owner = THIS_MODULE, .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; static int __init vsock_diag_init(void) { return sock_diag_register(&vsock_diag_handler); } static void __exit vsock_diag_exit(void) { sock_diag_unregister(&vsock_diag_handler); } module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */);
18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 // SPDX-License-Identifier: GPL-2.0 /* * This module exports the functions: * * 'int set_selection_user(struct tiocl_selection __user *, * struct tty_struct *)' * 'int set_selection_kernel(struct tiocl_selection *, struct tty_struct *)' * 'void clear_selection(void)' * 'int paste_selection(struct tty_struct *)' * 'int sel_loadlut(u32 __user *)' * * Now that /dev/vcs exists, most of this can disappear again. */ #include <linux/module.h> #include <linux/tty.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/consolemap.h> #include <linux/selection.h> #include <linux/tiocl.h> #include <linux/console.h> #include <linux/tty_flip.h> #include <linux/sched/signal.h> /* Don't take this from <ctype.h>: 011-015 on the screen aren't spaces */ #define is_space_on_vt(c) ((c) == ' ') /* FIXME: all this needs locking */ static struct vc_selection { struct mutex lock; struct vc_data *cons; /* must not be deallocated */ char *buffer; unsigned int buf_len; volatile int start; /* cleared by clear_selection */ int end; } vc_sel = { .lock = __MUTEX_INITIALIZER(vc_sel.lock), .start = -1, }; /* clear_selection, highlight and highlight_pointer can be called from interrupt (via scrollback/front) */ /* set reverse video on characters s-e of console with selection. */ static inline void highlight(const int s, const int e) { invert_screen(vc_sel.cons, s, e-s+2, true); } /* use complementary color to show the pointer */ static inline void highlight_pointer(const int where) { complement_pos(vc_sel.cons, where); } static u32 sel_pos(int n, bool unicode) { if (unicode) return screen_glyph_unicode(vc_sel.cons, n / 2); return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n), false); } /** * clear_selection - remove current selection * * Remove the current selection highlight, if any from the console holding the * selection. * * Locking: The caller must hold the console lock. */ void clear_selection(void) { highlight_pointer(-1); /* hide the pointer */ if (vc_sel.start != -1) { highlight(vc_sel.start, vc_sel.end); vc_sel.start = -1; } } EXPORT_SYMBOL_GPL(clear_selection); bool vc_is_sel(const struct vc_data *vc) { return vc == vc_sel.cons; } /* * User settable table: what characters are to be considered alphabetic? * 128 bits. Locked by the console lock. */ static u32 inwordLut[]={ 0x00000000, /* control chars */ 0x03FFE000, /* digits and "-./" */ 0x87FFFFFE, /* uppercase and '_' */ 0x07FFFFFE, /* lowercase */ }; static inline int inword(const u32 c) { return c > 0x7f || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1); } /** * sel_loadlut() - load the LUT table * @lut: user table * * Load the LUT table from user space. Make a temporary copy so a partial * update doesn't make a mess. * * Locking: The console lock is acquired. */ int sel_loadlut(u32 __user *lut) { u32 tmplut[ARRAY_SIZE(inwordLut)]; if (copy_from_user(tmplut, lut, sizeof(inwordLut))) return -EFAULT; guard(console_lock)(); memcpy(inwordLut, tmplut, sizeof(inwordLut)); return 0; } /* does screen address p correspond to character at LH/RH edge of screen? */ static inline int atedge(const int p, int size_row) { return (!(p % size_row) || !((p + 2) % size_row)); } /* stores the char in UTF8 and returns the number of bytes used (1-4) */ static int store_utf8(u32 c, char *p) { if (c < 0x80) { /* 0******* */ p[0] = c; return 1; } else if (c < 0x800) { /* 110***** 10****** */ p[0] = 0xc0 | (c >> 6); p[1] = 0x80 | (c & 0x3f); return 2; } else if (c < 0x10000) { /* 1110**** 10****** 10****** */ p[0] = 0xe0 | (c >> 12); p[1] = 0x80 | ((c >> 6) & 0x3f); p[2] = 0x80 | (c & 0x3f); return 3; } else if (c < 0x110000) { /* 11110*** 10****** 10****** 10****** */ p[0] = 0xf0 | (c >> 18); p[1] = 0x80 | ((c >> 12) & 0x3f); p[2] = 0x80 | ((c >> 6) & 0x3f); p[3] = 0x80 | (c & 0x3f); return 4; } else { /* outside Unicode, replace with U+FFFD */ p[0] = 0xef; p[1] = 0xbf; p[2] = 0xbd; return 3; } } /** * set_selection_user - set the current selection. * @sel: user selection info * @tty: the console tty * * Invoked by the ioctl handle for the vt layer. * * Locking: The entire selection process is managed under the console_lock. * It's a lot under the lock but its hardly a performance path. */ int set_selection_user(const struct tiocl_selection __user *sel, struct tty_struct *tty) { struct tiocl_selection v; if (copy_from_user(&v, sel, sizeof(*sel))) return -EFAULT; /* * TIOCL_SELCLEAR and TIOCL_SELPOINTER are OK to use without * CAP_SYS_ADMIN as they do not modify the selection. */ switch (v.sel_mode) { case TIOCL_SELCLEAR: case TIOCL_SELPOINTER: break; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return set_selection_kernel(&v, tty); } static int vc_selection_store_chars(struct vc_data *vc, bool unicode) { char *bp, *obp; unsigned int i; /* Allocate a new buffer before freeing the old one ... */ /* chars can take up to 4 bytes with unicode */ bp = kmalloc_array((vc_sel.end - vc_sel.start) / 2 + 1, unicode ? 4 : 1, GFP_KERNEL | __GFP_NOWARN); if (!bp) { printk(KERN_WARNING "selection: kmalloc() failed\n"); clear_selection(); return -ENOMEM; } kfree(vc_sel.buffer); vc_sel.buffer = bp; obp = bp; for (i = vc_sel.start; i <= vc_sel.end; i += 2) { u32 c = sel_pos(i, unicode); if (unicode) bp += store_utf8(c, bp); else *bp++ = c; if (!is_space_on_vt(c)) obp = bp; if (!((i + 2) % vc->vc_size_row)) { /* strip trailing blanks from line and add newline, unless non-space at end of line. */ if (obp != bp) { bp = obp; *bp++ = '\r'; } obp = bp; } } vc_sel.buf_len = bp - vc_sel.buffer; return 0; } static int vc_do_selection(struct vc_data *vc, unsigned short mode, int ps, int pe) { int new_sel_start, new_sel_end, spc; bool unicode = vt_do_kdgkbmode(fg_console) == K_UNICODE; switch (mode) { case TIOCL_SELCHAR: /* character-by-character selection */ new_sel_start = ps; new_sel_end = pe; break; case TIOCL_SELWORD: /* word-by-word selection */ spc = is_space_on_vt(sel_pos(ps, unicode)); for (new_sel_start = ps; ; ps -= 2) { if ((spc && !is_space_on_vt(sel_pos(ps, unicode))) || (!spc && !inword(sel_pos(ps, unicode)))) break; new_sel_start = ps; if (!(ps % vc->vc_size_row)) break; } spc = is_space_on_vt(sel_pos(pe, unicode)); for (new_sel_end = pe; ; pe += 2) { if ((spc && !is_space_on_vt(sel_pos(pe, unicode))) || (!spc && !inword(sel_pos(pe, unicode)))) break; new_sel_end = pe; if (!((pe + 2) % vc->vc_size_row)) break; } break; case TIOCL_SELLINE: /* line-by-line selection */ new_sel_start = rounddown(ps, vc->vc_size_row); new_sel_end = rounddown(pe, vc->vc_size_row) + vc->vc_size_row - 2; break; case TIOCL_SELPOINTER: highlight_pointer(pe); return 0; default: return -EINVAL; } /* remove the pointer */ highlight_pointer(-1); /* select to end of line if on trailing space */ if (new_sel_end > new_sel_start && !atedge(new_sel_end, vc->vc_size_row) && is_space_on_vt(sel_pos(new_sel_end, unicode))) { for (pe = new_sel_end + 2; ; pe += 2) if (!is_space_on_vt(sel_pos(pe, unicode)) || atedge(pe, vc->vc_size_row)) break; if (is_space_on_vt(sel_pos(pe, unicode))) new_sel_end = pe; } if (vc_sel.start == -1) /* no current selection */ highlight(new_sel_start, new_sel_end); else if (new_sel_start == vc_sel.start) { if (new_sel_end == vc_sel.end) /* no action required */ return 0; else if (new_sel_end > vc_sel.end) /* extend to right */ highlight(vc_sel.end + 2, new_sel_end); else /* contract from right */ highlight(new_sel_end + 2, vc_sel.end); } else if (new_sel_end == vc_sel.end) { if (new_sel_start < vc_sel.start) /* extend to left */ highlight(new_sel_start, vc_sel.start - 2); else /* contract from left */ highlight(vc_sel.start, new_sel_start - 2); } else /* some other case; start selection from scratch */ { clear_selection(); highlight(new_sel_start, new_sel_end); } vc_sel.start = new_sel_start; vc_sel.end = new_sel_end; return vc_selection_store_chars(vc, unicode); } static int vc_selection(struct vc_data *vc, struct tiocl_selection *v, struct tty_struct *tty) { int ps, pe; poke_blanked_console(); if (v->sel_mode == TIOCL_SELCLEAR) { /* useful for screendump without selection highlights */ clear_selection(); return 0; } /* Historically 0 => max value */ v->xs = umin(v->xs - 1, vc->vc_cols - 1); v->ys = umin(v->ys - 1, vc->vc_rows - 1); v->xe = umin(v->xe - 1, vc->vc_cols - 1); v->ye = umin(v->ye - 1, vc->vc_rows - 1); if (mouse_reporting() && (v->sel_mode & TIOCL_SELMOUSEREPORT)) { mouse_report(tty, v->sel_mode & TIOCL_SELBUTTONMASK, v->xs, v->ys); return 0; } ps = v->ys * vc->vc_size_row + (v->xs << 1); pe = v->ye * vc->vc_size_row + (v->xe << 1); if (ps > pe) /* make vc_sel.start <= vc_sel.end */ swap(ps, pe); if (vc_sel.cons != vc) { clear_selection(); vc_sel.cons = vc; } return vc_do_selection(vc, v->sel_mode, ps, pe); } int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) { guard(mutex)(&vc_sel.lock); guard(console_lock)(); return vc_selection(vc_cons[fg_console].d, v, tty); } EXPORT_SYMBOL_GPL(set_selection_kernel); /* Insert the contents of the selection buffer into the * queue of the tty associated with the current console. * Invoked by ioctl(). * * Locking: called without locks. Calls the ldisc wrongly with * unsafe methods, */ int paste_selection(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; int pasted = 0; size_t count; struct tty_ldisc *ld; DECLARE_WAITQUEUE(wait, current); int ret = 0; bool bp = vc->vc_bracketed_paste; static const char bracketed_paste_start[] = "\033[200~"; static const char bracketed_paste_end[] = "\033[201~"; const char *bps = bp ? bracketed_paste_start : NULL; const char *bpe = bp ? bracketed_paste_end : NULL; scoped_guard(console_lock) poke_blanked_console(); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; /* ldisc was hung up */ tty_buffer_lock_exclusive(&vc->port); add_wait_queue(&vc->paste_wait, &wait); mutex_lock(&vc_sel.lock); while (vc_sel.buffer && (vc_sel.buf_len > pasted || bpe)) { set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { ret = -EINTR; break; } if (tty_throttled(tty)) { mutex_unlock(&vc_sel.lock); schedule(); mutex_lock(&vc_sel.lock); continue; } __set_current_state(TASK_RUNNING); if (bps) { bps += tty_ldisc_receive_buf(ld, bps, NULL, strlen(bps)); if (*bps != '\0') continue; bps = NULL; } count = vc_sel.buf_len - pasted; if (count) { pasted += tty_ldisc_receive_buf(ld, vc_sel.buffer + pasted, NULL, count); if (vc_sel.buf_len > pasted) continue; } if (bpe) { bpe += tty_ldisc_receive_buf(ld, bpe, NULL, strlen(bpe)); if (*bpe == '\0') bpe = NULL; } } mutex_unlock(&vc_sel.lock); remove_wait_queue(&vc->paste_wait, &wait); __set_current_state(TASK_RUNNING); tty_buffer_unlock_exclusive(&vc->port); tty_ldisc_deref(ld); return ret; } EXPORT_SYMBOL_GPL(paste_selection);
6 3 1 3 3 3 3 2 2 2 3 4 4 4 4 4 6 6 4 6 6 6 5 6 6 4 6 6 6 6 6 6 6 6 6 6 6 2 2 6 6 6 5 6 5 5 5 5 6 6 6 6 6 6 6 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic MIDI synth driver for ALSA sequencer * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ /* Possible options for midisynth module: - automatic opening of midi ports on first received event or subscription (close will be performed when client leaves) */ #include <linux/init.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/module.h> #include <linux/mutex.h> #include <sound/core.h> #include <sound/rawmidi.h> #include <sound/seq_kernel.h> #include <sound/seq_device.h> #include <sound/seq_midi_event.h> #include <sound/initval.h> MODULE_AUTHOR("Frank van de Pol <fvdpol@coil.demon.nl>, Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Advanced Linux Sound Architecture sequencer MIDI synth."); MODULE_LICENSE("GPL"); static int output_buffer_size = PAGE_SIZE; module_param(output_buffer_size, int, 0644); MODULE_PARM_DESC(output_buffer_size, "Output buffer size in bytes."); static int input_buffer_size = PAGE_SIZE; module_param(input_buffer_size, int, 0644); MODULE_PARM_DESC(input_buffer_size, "Input buffer size in bytes."); /* data for this midi synth driver */ struct seq_midisynth { struct snd_card *card; struct snd_rawmidi *rmidi; int device; int subdevice; struct snd_rawmidi_file input_rfile; struct snd_rawmidi_file output_rfile; int seq_client; int seq_port; struct snd_midi_event *parser; }; struct seq_midisynth_client { int seq_client; int num_ports; int ports_per_device[SNDRV_RAWMIDI_DEVICES]; struct seq_midisynth *ports[SNDRV_RAWMIDI_DEVICES]; }; static struct seq_midisynth_client *synths[SNDRV_CARDS]; static DEFINE_MUTEX(register_mutex); /* handle rawmidi input event (MIDI v1.0 stream) */ static void snd_midi_input_event(struct snd_rawmidi_substream *substream) { struct snd_rawmidi_runtime *runtime; struct seq_midisynth *msynth; struct snd_seq_event ev; char buf[16], *pbuf; long res; if (substream == NULL) return; runtime = substream->runtime; msynth = runtime->private_data; if (msynth == NULL) return; memset(&ev, 0, sizeof(ev)); while (runtime->avail > 0) { res = snd_rawmidi_kernel_read(substream, buf, sizeof(buf)); if (res <= 0) continue; if (msynth->parser == NULL) continue; pbuf = buf; while (res-- > 0) { if (!snd_midi_event_encode_byte(msynth->parser, *pbuf++, &ev)) continue; ev.source.port = msynth->seq_port; ev.dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; snd_seq_kernel_client_dispatch(msynth->seq_client, &ev, 1, 0); /* clear event and reset header */ memset(&ev, 0, sizeof(ev)); } } } static int dump_midi(struct snd_rawmidi_substream *substream, const char *buf, int count) { struct snd_rawmidi_runtime *runtime; int tmp; if (snd_BUG_ON(!substream || !buf)) return -EINVAL; runtime = substream->runtime; tmp = runtime->avail; if (tmp < count) { if (printk_ratelimit()) pr_err("ALSA: seq_midi: MIDI output buffer overrun\n"); return -ENOMEM; } if (snd_rawmidi_kernel_write(substream, buf, count) < count) return -EINVAL; return 0; } /* callback for snd_seq_dump_var_event(), bridging to dump_midi() */ static int __dump_midi(void *ptr, void *buf, int count) { return dump_midi(ptr, buf, count); } static int event_process_midi(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct seq_midisynth *msynth = private_data; unsigned char msg[10]; /* buffer for constructing midi messages */ struct snd_rawmidi_substream *substream; int len; if (snd_BUG_ON(!msynth)) return -EINVAL; substream = msynth->output_rfile.output; if (substream == NULL) return -ENODEV; if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { /* special case, to save space */ if ((ev->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) { /* invalid event */ pr_debug("ALSA: seq_midi: invalid sysex event flags = 0x%x\n", ev->flags); return 0; } snd_seq_dump_var_event(ev, __dump_midi, substream); snd_midi_event_reset_decode(msynth->parser); } else { if (msynth->parser == NULL) return -EIO; len = snd_midi_event_decode(msynth->parser, msg, sizeof(msg), ev); if (len < 0) return 0; if (dump_midi(substream, msg, len) < 0) snd_midi_event_reset_decode(msynth->parser); } return 0; } static int snd_seq_midisynth_new(struct seq_midisynth *msynth, struct snd_card *card, int device, int subdevice) { if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &msynth->parser) < 0) return -ENOMEM; msynth->card = card; msynth->device = device; msynth->subdevice = subdevice; return 0; } /* open associated midi device for input */ static int midisynth_subscribe(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; struct snd_rawmidi_runtime *runtime; struct snd_rawmidi_params params; /* open midi port */ err = snd_rawmidi_kernel_open(msynth->rmidi, msynth->subdevice, SNDRV_RAWMIDI_LFLG_INPUT, &msynth->input_rfile); if (err < 0) { pr_debug("ALSA: seq_midi: midi input open failed!!!\n"); return err; } runtime = msynth->input_rfile.input->runtime; memset(&params, 0, sizeof(params)); params.avail_min = 1; params.buffer_size = input_buffer_size; err = snd_rawmidi_input_params(msynth->input_rfile.input, &params); if (err < 0) { snd_rawmidi_kernel_release(&msynth->input_rfile); return err; } snd_midi_event_reset_encode(msynth->parser); runtime->event = snd_midi_input_event; runtime->private_data = msynth; snd_rawmidi_kernel_read(msynth->input_rfile.input, NULL, 0); return 0; } /* close associated midi device for input */ static int midisynth_unsubscribe(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; if (snd_BUG_ON(!msynth->input_rfile.input)) return -EINVAL; err = snd_rawmidi_kernel_release(&msynth->input_rfile); return err; } /* open associated midi device for output */ static int midisynth_use(void *private_data, struct snd_seq_port_subscribe *info) { int err; struct seq_midisynth *msynth = private_data; struct snd_rawmidi_params params; /* open midi port */ err = snd_rawmidi_kernel_open(msynth->rmidi, msynth->subdevice, SNDRV_RAWMIDI_LFLG_OUTPUT, &msynth->output_rfile); if (err < 0) { pr_debug("ALSA: seq_midi: midi output open failed!!!\n"); return err; } memset(&params, 0, sizeof(params)); params.avail_min = 1; params.buffer_size = output_buffer_size; params.no_active_sensing = 1; err = snd_rawmidi_output_params(msynth->output_rfile.output, &params); if (err < 0) { snd_rawmidi_kernel_release(&msynth->output_rfile); return err; } snd_midi_event_reset_decode(msynth->parser); return 0; } /* close associated midi device for output */ static int midisynth_unuse(void *private_data, struct snd_seq_port_subscribe *info) { struct seq_midisynth *msynth = private_data; if (snd_BUG_ON(!msynth->output_rfile.output)) return -EINVAL; snd_rawmidi_drain_output(msynth->output_rfile.output); return snd_rawmidi_kernel_release(&msynth->output_rfile); } /* delete given midi synth port */ static void snd_seq_midisynth_delete(struct seq_midisynth *msynth) { if (msynth == NULL) return; if (msynth->seq_client > 0) { /* delete port */ snd_seq_event_port_detach(msynth->seq_client, msynth->seq_port); } snd_midi_event_free(msynth->parser); } /* register new midi synth port */ static int snd_seq_midisynth_probe(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); struct seq_midisynth_client *client; struct seq_midisynth *msynth, *ms; struct snd_seq_port_info *port __free(kfree) = NULL; struct snd_rawmidi_info *info __free(kfree) = NULL; struct snd_rawmidi *rmidi = dev->private_data; int newclient = 0; unsigned int p, ports; struct snd_seq_port_callback pcallbacks; struct snd_card *card = dev->card; int device = dev->device; unsigned int input_count = 0, output_count = 0; if (snd_BUG_ON(!card || device < 0 || device >= SNDRV_RAWMIDI_DEVICES)) return -EINVAL; info = kmalloc(sizeof(*info), GFP_KERNEL); if (! info) return -ENOMEM; info->device = device; info->stream = SNDRV_RAWMIDI_STREAM_OUTPUT; info->subdevice = 0; if (snd_rawmidi_info_select(card, info) >= 0) output_count = info->subdevices_count; info->stream = SNDRV_RAWMIDI_STREAM_INPUT; if (snd_rawmidi_info_select(card, info) >= 0) { input_count = info->subdevices_count; } ports = output_count; if (ports < input_count) ports = input_count; if (ports == 0) return -ENODEV; if (ports > (256 / SNDRV_RAWMIDI_DEVICES)) ports = 256 / SNDRV_RAWMIDI_DEVICES; guard(mutex)(&register_mutex); client = synths[card->number]; if (client == NULL) { newclient = 1; client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return -ENOMEM; client->seq_client = snd_seq_create_kernel_client( card, 0, "%s", card->shortname[0] ? (const char *)card->shortname : "External MIDI"); if (client->seq_client < 0) { kfree(client); return -ENOMEM; } } msynth = kcalloc(ports, sizeof(struct seq_midisynth), GFP_KERNEL); port = kmalloc(sizeof(*port), GFP_KERNEL); if (msynth == NULL || port == NULL) goto __nomem; for (p = 0; p < ports; p++) { ms = &msynth[p]; ms->rmidi = rmidi; if (snd_seq_midisynth_new(ms, card, device, p) < 0) goto __nomem; /* declare port */ memset(port, 0, sizeof(*port)); port->addr.client = client->seq_client; port->addr.port = device * (256 / SNDRV_RAWMIDI_DEVICES) + p; port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; memset(info, 0, sizeof(*info)); info->device = device; if (p < output_count) info->stream = SNDRV_RAWMIDI_STREAM_OUTPUT; else info->stream = SNDRV_RAWMIDI_STREAM_INPUT; info->subdevice = p; if (snd_rawmidi_info_select(card, info) >= 0) strscpy(port->name, info->subname); if (! port->name[0]) { if (info->name[0]) { if (ports > 1) scnprintf(port->name, sizeof(port->name), "%s-%u", info->name, p); else scnprintf(port->name, sizeof(port->name), "%s", info->name); } else { /* last resort */ if (ports > 1) sprintf(port->name, "MIDI %d-%d-%u", card->number, device, p); else sprintf(port->name, "MIDI %d-%d", card->number, device); } } if ((info->flags & SNDRV_RAWMIDI_INFO_OUTPUT) && p < output_count) port->capability |= SNDRV_SEQ_PORT_CAP_WRITE | SNDRV_SEQ_PORT_CAP_SYNC_WRITE | SNDRV_SEQ_PORT_CAP_SUBS_WRITE; if ((info->flags & SNDRV_RAWMIDI_INFO_INPUT) && p < input_count) port->capability |= SNDRV_SEQ_PORT_CAP_READ | SNDRV_SEQ_PORT_CAP_SYNC_READ | SNDRV_SEQ_PORT_CAP_SUBS_READ; if ((port->capability & (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_READ)) == (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_READ) && info->flags & SNDRV_RAWMIDI_INFO_DUPLEX) port->capability |= SNDRV_SEQ_PORT_CAP_DUPLEX; if (port->capability & SNDRV_SEQ_PORT_CAP_READ) port->direction |= SNDRV_SEQ_PORT_DIR_INPUT; if (port->capability & SNDRV_SEQ_PORT_CAP_WRITE) port->direction |= SNDRV_SEQ_PORT_DIR_OUTPUT; port->type = SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_HARDWARE | SNDRV_SEQ_PORT_TYPE_PORT; port->midi_channels = 16; memset(&pcallbacks, 0, sizeof(pcallbacks)); pcallbacks.owner = THIS_MODULE; pcallbacks.private_data = ms; pcallbacks.subscribe = midisynth_subscribe; pcallbacks.unsubscribe = midisynth_unsubscribe; pcallbacks.use = midisynth_use; pcallbacks.unuse = midisynth_unuse; pcallbacks.event_input = event_process_midi; port->kernel = &pcallbacks; if (rmidi->ops && rmidi->ops->get_port_info) rmidi->ops->get_port_info(rmidi, p, port); if (snd_seq_kernel_client_ctl(client->seq_client, SNDRV_SEQ_IOCTL_CREATE_PORT, port)<0) goto __nomem; ms->seq_client = client->seq_client; ms->seq_port = port->addr.port; } client->ports_per_device[device] = ports; client->ports[device] = msynth; client->num_ports++; if (newclient) synths[card->number] = client; return 0; /* success */ __nomem: if (msynth != NULL) { for (p = 0; p < ports; p++) snd_seq_midisynth_delete(&msynth[p]); kfree(msynth); } if (newclient) { snd_seq_delete_kernel_client(client->seq_client); kfree(client); } return -ENOMEM; } /* release midi synth port */ static int snd_seq_midisynth_remove(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); struct seq_midisynth_client *client; struct seq_midisynth *msynth; struct snd_card *card = dev->card; int device = dev->device, p, ports; guard(mutex)(&register_mutex); client = synths[card->number]; if (client == NULL || client->ports[device] == NULL) return -ENODEV; ports = client->ports_per_device[device]; client->ports_per_device[device] = 0; msynth = client->ports[device]; client->ports[device] = NULL; for (p = 0; p < ports; p++) snd_seq_midisynth_delete(&msynth[p]); kfree(msynth); client->num_ports--; if (client->num_ports <= 0) { snd_seq_delete_kernel_client(client->seq_client); synths[card->number] = NULL; kfree(client); } return 0; } static struct snd_seq_driver seq_midisynth_driver = { .driver = { .name = KBUILD_MODNAME, .probe = snd_seq_midisynth_probe, .remove = snd_seq_midisynth_remove, }, .id = SNDRV_SEQ_DEV_ID_MIDISYNTH, .argsize = 0, }; module_snd_seq_driver(seq_midisynth_driver);
151 1 151 150 151 151 151 3472 500 3480 2755 3474 1330 956 1341 1327 1328 1329 2911 2910 2698 2911 338 338 331 339 334 38 38 315 336 227 228 227 226 228 226 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; #ifdef CONFIG_DEBUG_FS #include <linux/xarray.h> /* * ref_tracker_dir_init() is usually called in allocation-safe contexts, but * the same is not true of ref_tracker_dir_exit() which can be called from * anywhere an object is freed. Removing debugfs dentries is a blocking * operation, so we defer that work to the debugfs_reap_worker. * * Each dentry is tracked in the appropriate xarray. When * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and * the workqueue job is scheduled. The worker then runs and deletes any marked * dentries asynchronously. */ static struct xarray debugfs_dentries; static struct xarray debugfs_symlinks; static struct work_struct debugfs_reap_worker; #define REF_TRACKER_DIR_DEAD XA_MARK_0 static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { unsigned long flags; xa_lock_irqsave(&debugfs_dentries, flags); __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_dentries, flags); xa_lock_irqsave(&debugfs_symlinks, flags); __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_symlinks, flags); schedule_work(&debugfs_reap_worker); } #else static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { } #endif static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...); char *prefix; char *buf; struct seq_file *seq; int size, used; }; static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); vprintk(fmt, args); va_end(args); } static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...) { int ret, len = stream->size - stream->used; va_list args; va_start(args, fmt); ret = vsnprintf(stream->buf + stream->used, len, fmt, args); va_end(args); if (ret > 0) stream->used += min(ret, len); } #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ _s->func(_s, fmt, ##args); \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n", s->prefix, dir->class, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix, dir->class, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n", s->prefix, dir->class, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = { .func = pr_ostream_log, .prefix = "ref_tracker: " }; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .func = pr_ostream_buf, .prefix = "ref_tracker: ", .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; /* * The xarray entries must be marked before the dir->lock is taken to * protect simultaneous debugfs readers. */ ref_tracker_debugfs_mark(dir); spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT; static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); seq_vprintf(stream->seq, fmt, args); va_end(args); } static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq) { struct ostream os = { .func = pr_ostream_seq, .prefix = "", .seq = seq }; __ref_tracker_dir_pr_ostream(dir, 16, &os); return os.used; } static int ref_tracker_debugfs_show(struct seq_file *f, void *v) { struct ref_tracker_dir *dir = f->private; unsigned long index = (unsigned long)dir; unsigned long flags; int ret; /* * "dir" may not exist at this point if ref_tracker_dir_exit() has * already been called. Take care not to dereference it until its * legitimacy is established. * * The xa_lock is necessary to ensure that "dir" doesn't disappear * before its lock can be taken. If it's in the hash and not marked * dead, then it's safe to take dir->lock which prevents * ref_tracker_dir_exit() from completing. Once the dir->lock is * acquired, the xa_lock can be released. All of this must be IRQ-safe. */ xa_lock_irqsave(&debugfs_dentries, flags); if (!xa_load(&debugfs_dentries, index) || xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) { xa_unlock_irqrestore(&debugfs_dentries, flags); return -ENODATA; } spin_lock(&dir->lock); xa_unlock(&debugfs_dentries); ret = ref_tracker_dir_seq_print(dir, f); spin_unlock_irqrestore(&dir->lock, flags); return ret; } static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp) { struct ref_tracker_dir *dir = inode->i_private; return single_open(filp, ref_tracker_debugfs_show, dir); } static const struct file_operations ref_tracker_debugfs_fops = { .owner = THIS_MODULE, .open = ref_tracker_debugfs_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /** * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir * @dir: ref_tracker_dir to be associated with debugfs file * * In most cases, a debugfs file will be created automatically for every * ref_tracker_dir. If the object was created before debugfs is brought up * then that may fail. In those cases, it is safe to call this at a later * time to create the file. */ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir) { char name[NAME_MAX + 1]; struct dentry *dentry; int ret; /* No-op if already created */ dentry = xa_load(&debugfs_dentries, (unsigned long)dir); if (dentry && !xa_is_err(dentry)) return; ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { dentry = debugfs_create_file(name, S_IFREG | 0400, ref_tracker_debug_dir, dir, &ref_tracker_debugfs_fops); if (!IS_ERR(dentry)) { void *old; old = xa_store_irq(&debugfs_dentries, (unsigned long)dir, dentry, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(dentry); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_debugfs); void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...) { char name[NAME_MAX + 1]; struct dentry *symlink, *dentry; va_list args; int ret; symlink = xa_load(&debugfs_symlinks, (unsigned long)dir); dentry = xa_load(&debugfs_dentries, (unsigned long)dir); /* Already created?*/ if (symlink && !xa_is_err(symlink)) return; if (!dentry || xa_is_err(dentry)) return; va_start(args, fmt); ret = vsnprintf(name, sizeof(name), fmt, args); va_end(args); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { symlink = debugfs_create_symlink(name, ref_tracker_debug_dir, dentry->d_name.name); if (!IS_ERR(symlink)) { void *old; old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir, symlink, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(symlink); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_symlink); static void debugfs_reap_work(struct work_struct *work) { struct dentry *dentry; unsigned long index; bool reaped; do { reaped = false; xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_symlinks, index); debugfs_remove(dentry); reaped = true; } xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_dentries, index); debugfs_remove(dentry); reaped = true; } } while (reaped); } static int __init ref_tracker_debugfs_postcore_init(void) { INIT_WORK(&debugfs_reap_worker, debugfs_reap_work); xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ); xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ); return 0; } postcore_initcall(ref_tracker_debugfs_postcore_init); static int __init ref_tracker_debugfs_late_init(void) { ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL); return 0; } late_initcall(ref_tracker_debugfs_late_init); #endif /* CONFIG_DEBUG_FS */
130 106 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_NAMESPACE_H #define _LINUX_CGROUP_NAMESPACE_H #include <linux/ns_common.h> struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); static inline void get_cgroup_ns(struct cgroup_namespace *ns) { ns_ref_inc(ns); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns_ref_put(ns)) free_cgroup_ns(ns); } #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_NAMESPACE_H */
18 22 23 4 4 4 4 4 4 4 4 2 2 13 13 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/minmax.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/types.h> #include <linux/wait.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @word: word holding free bits */ unsigned long word; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap. */ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @map: Allocated bitmap. */ struct sbitmap_word *map; /** * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; /** * @completion_cnt: Number of bits cleared passed to the * wakeup function. */ atomic_t completion_cnt; /** * @wakeup_cnt: Number of thread wake ups issued. */ atomic_t wakeup_cnt; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * @alloc_hint: If true, apply percpu hint for where to start searching for * a free bit. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint); /* sbitmap internal helper */ static inline unsigned int __map_depth(const struct sbitmap *sb, int index) { if (index == sb->map_nr - 1) return sb->depth - (index << sb->shift); return 1U << sb->shift; } /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { free_percpu(sb->alloc_hint); kvfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. * * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, __map_depth(sb, index) - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } /* * Pair of sbitmap_get, and this one applies both cleared bit and * allocation hint. */ static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr) { sbitmap_deferred_clear_bit(sb, bitnr); if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth)) *raw_cpu_ptr(sb->alloc_hint) = bitnr; } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_calculate_shift(unsigned int depth) { int shift = ilog2(BITS_PER_LONG); /* * If the bitmap is small, shrink the number of bits per word so * we spread over a few cachelines, at least. If less than 4 * bits, just forget about it, it's not going to work optimally * anyway. */ if (depth >= 4) { while ((4U << shift) > depth) shift--; } return shift; } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_weight() - Return how many set and not cleared bits in a &struct * sbitmap. * @sb: Bitmap to check. * * Return: How many set and not cleared bits set */ unsigned int sbitmap_weight(const struct sbitmap *sb); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch * @sbq: Bitmap queue to recalculate wake batch. * @users: Number of shares. * * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch * by depth. This interface is for HCTX shared tags or queue shared tags. */ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users); /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits * @sbq: Bitmap queue to allocate from. * @nr_tags: number of tags requested * @offset: offset to add to returned bits * * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is * a bit in the mask returned, and the caller must add @offset to the value to * get the absolute tag value. */ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset); /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from the queue. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); /** * sbitmap_queue_clear_batch() - Free a batch of allocated bits * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @offset: offset for each tag in array * @tags: array of tags * @nr_tags: number of tags in array */ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. * * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. * @nr: Number of bits cleared. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue() */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */
21 21 3 1 3 2 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor policy manipulation functions * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. * * AppArmor policy namespaces, allow for different sets of policies * to be loaded for tasks within the namespace. */ #include <linux/list.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/policy_ns.h" #include "include/label.h" #include "include/policy.h" /* kernel label */ struct aa_label *kernel_t; /* root profile namespace */ struct aa_ns *root_ns; const char *aa_hidden_ns_name = "---"; /** * aa_ns_visible - test if @view is visible from @curr * @curr: namespace to treat as the parent (NOT NULL) * @view: namespace to test if visible from @curr (NOT NULL) * @subns: whether view of a subns is allowed * * Returns: true if @view is visible from @curr else false */ bool aa_ns_visible(struct aa_ns *curr, struct aa_ns *view, bool subns) { if (curr == view) return true; if (!subns) return false; for ( ; view; view = view->parent) { if (view->parent == curr) return true; } return false; } /** * aa_ns_name - Find the ns name to display for @view from @curr * @curr: current namespace (NOT NULL) * @view: namespace attempting to view (NOT NULL) * @subns: are subns visible * * Returns: name of @view visible from @curr */ const char *aa_ns_name(struct aa_ns *curr, struct aa_ns *view, bool subns) { /* if view == curr then the namespace name isn't displayed */ if (curr == view) return ""; if (aa_ns_visible(curr, view, subns)) { /* at this point if a ns is visible it is in a view ns * thus the curr ns.hname is a prefix of its name. * Only output the virtualized portion of the name * Add + 2 to skip over // separating curr hname prefix * from the visible tail of the views hname */ return view->base.hname + strlen(curr->base.hname) + 2; } return aa_hidden_ns_name; } static struct aa_profile *alloc_unconfined(const char *name) { struct aa_profile *profile; profile = aa_alloc_null(NULL, name, GFP_KERNEL); if (!profile) return NULL; profile->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; profile->mode = APPARMOR_UNCONFINED; return profile; } /** * alloc_ns - allocate, initialize and return a new namespace * @prefix: parent namespace name (MAYBE NULL) * @name: a preallocated name (NOT NULL) * * Returns: refcounted namespace or NULL on failure. */ static struct aa_ns *alloc_ns(const char *prefix, const char *name) { struct aa_ns *ns; ns = kzalloc(sizeof(*ns), GFP_KERNEL); AA_DEBUG(DEBUG_POLICY, "%s(%p)\n", __func__, ns); if (!ns) return NULL; if (!aa_policy_init(&ns->base, prefix, name, GFP_KERNEL)) goto fail_ns; INIT_LIST_HEAD(&ns->sub_ns); INIT_LIST_HEAD(&ns->rawdata_list); mutex_init(&ns->lock); init_waitqueue_head(&ns->wait); /* released by aa_free_ns() */ ns->unconfined = alloc_unconfined("unconfined"); if (!ns->unconfined) goto fail_unconfined; /* ns and ns->unconfined share ns->unconfined refcount */ ns->unconfined->ns = ns; atomic_set(&ns->uniq_null, 0); aa_labelset_init(&ns->labels); return ns; fail_unconfined: aa_policy_destroy(&ns->base); fail_ns: kfree_sensitive(ns); return NULL; } /** * aa_free_ns - free a profile namespace * @ns: the namespace to free (MAYBE NULL) * * Requires: All references to the namespace must have been put, if the * namespace was referenced by a profile confining a task, */ void aa_free_ns(struct aa_ns *ns) { if (!ns) return; aa_policy_destroy(&ns->base); aa_labelset_destroy(&ns->labels); aa_put_ns(ns->parent); ns->unconfined->ns = NULL; aa_free_profile(ns->unconfined); kfree_sensitive(ns); } /** * __aa_lookupn_ns - lookup the namespace matching @hname * @view: namespace to search in (NOT NULL) * @hname: hierarchical ns name (NOT NULL) * @n: length of @hname * * Requires: rcu_read_lock be held * * Returns: unrefcounted ns pointer or NULL if not found * * Do a relative name lookup, recursing through profile tree. */ struct aa_ns *__aa_lookupn_ns(struct aa_ns *view, const char *hname, size_t n) { struct aa_ns *ns = view; const char *split; for (split = strnstr(hname, "//", n); split; split = strnstr(hname, "//", n)) { ns = __aa_findn_ns(&ns->sub_ns, hname, split - hname); if (!ns) return NULL; n -= split + 2 - hname; hname = split + 2; } if (n) return __aa_findn_ns(&ns->sub_ns, hname, n); return NULL; } /** * aa_lookupn_ns - look up a policy namespace relative to @view * @view: namespace to search in (NOT NULL) * @name: name of namespace to find (NOT NULL) * @n: length of @name * * Returns: a refcounted namespace on the list, or NULL if no namespace * called @name exists. * * refcount released by caller */ struct aa_ns *aa_lookupn_ns(struct aa_ns *view, const char *name, size_t n) { struct aa_ns *ns = NULL; rcu_read_lock(); ns = aa_get_ns(__aa_lookupn_ns(view, name, n)); rcu_read_unlock(); return ns; } static struct aa_ns *__aa_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir) { struct aa_ns *ns; int error; AA_BUG(!parent); AA_BUG(!name); AA_BUG(!mutex_is_locked(&parent->lock)); ns = alloc_ns(parent->base.hname, name); if (!ns) return ERR_PTR(-ENOMEM); ns->level = parent->level + 1; mutex_lock_nested(&ns->lock, ns->level); error = __aafs_ns_mkdir(ns, ns_subns_dir(parent), name, dir); if (error) { AA_ERROR("Failed to create interface for ns %s\n", ns->base.name); mutex_unlock(&ns->lock); aa_free_ns(ns); return ERR_PTR(error); } ns->parent = aa_get_ns(parent); list_add_rcu(&ns->base.list, &parent->sub_ns); /* add list ref */ aa_get_ns(ns); mutex_unlock(&ns->lock); return ns; } /** * __aa_find_or_create_ns - create an ns, fail if it already exists * @parent: the parent of the namespace being created * @name: the name of the namespace * @dir: if not null the dir to put the ns entries in * * Returns: the a refcounted ns that has been add or an ERR_PTR */ struct aa_ns *__aa_find_or_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir) { struct aa_ns *ns; AA_BUG(!mutex_is_locked(&parent->lock)); /* try and find the specified ns */ /* released by caller */ ns = aa_get_ns(__aa_find_ns(&parent->sub_ns, name)); if (!ns) ns = __aa_create_ns(parent, name, dir); else ns = ERR_PTR(-EEXIST); /* return ref */ return ns; } /** * aa_prepare_ns - find an existing or create a new namespace of @name * @parent: ns to treat as parent * @name: the namespace to find or add (NOT NULL) * * Returns: refcounted namespace or PTR_ERR if failed to create one */ struct aa_ns *aa_prepare_ns(struct aa_ns *parent, const char *name) { struct aa_ns *ns; mutex_lock_nested(&parent->lock, parent->level); /* try and find the specified ns and if it doesn't exist create it */ /* released by caller */ ns = aa_get_ns(__aa_find_ns(&parent->sub_ns, name)); if (!ns) ns = __aa_create_ns(parent, name, NULL); mutex_unlock(&parent->lock); /* return ref */ return ns; } static void __ns_list_release(struct list_head *head); /** * destroy_ns - remove everything contained by @ns * @ns: namespace to have it contents removed (NOT NULL) */ static void destroy_ns(struct aa_ns *ns) { if (!ns) return; mutex_lock_nested(&ns->lock, ns->level); /* release all profiles in this namespace */ __aa_profile_list_release(&ns->base.profiles); /* release all sub namespaces */ __ns_list_release(&ns->sub_ns); if (ns->parent) { unsigned long flags; write_lock_irqsave(&ns->labels.lock, flags); __aa_proxy_redirect(ns_unconfined(ns), ns_unconfined(ns->parent)); write_unlock_irqrestore(&ns->labels.lock, flags); } __aafs_ns_rmdir(ns); mutex_unlock(&ns->lock); } /** * __aa_remove_ns - remove a namespace and all its children * @ns: namespace to be removed (NOT NULL) * * Requires: ns->parent->lock be held and ns removed from parent. */ void __aa_remove_ns(struct aa_ns *ns) { /* remove ns from namespace list */ list_del_rcu(&ns->base.list); destroy_ns(ns); aa_put_ns(ns); } /** * __ns_list_release - remove all profile namespaces on the list put refs * @head: list of profile namespaces (NOT NULL) * * Requires: namespace lock be held */ static void __ns_list_release(struct list_head *head) { struct aa_ns *ns, *tmp; list_for_each_entry_safe(ns, tmp, head, base.list) __aa_remove_ns(ns); } /** * aa_alloc_root_ns - allocate the root profile namespace * * Returns: %0 on success else error * */ int __init aa_alloc_root_ns(void) { struct aa_profile *kernel_p; /* released by aa_free_root_ns - used as list ref*/ root_ns = alloc_ns(NULL, "root"); if (!root_ns) return -ENOMEM; kernel_p = alloc_unconfined("kernel_t"); if (!kernel_p) { destroy_ns(root_ns); aa_free_ns(root_ns); return -ENOMEM; } kernel_t = &kernel_p->label; root_ns->unconfined->ns = aa_get_ns(root_ns); return 0; } /** * aa_free_root_ns - free the root profile namespace */ void __init aa_free_root_ns(void) { struct aa_ns *ns = root_ns; root_ns = NULL; aa_label_free(kernel_t); destroy_ns(ns); aa_put_ns(ns); }
3 1 1 1 1 1 1 1 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * Basic idea behind the notification queue: An fsnotify group (like inotify) * sends the userspace notification about events asynchronously some time after * the event happened. When inotify gets an event it will need to add that * event to the group notify queue. Since a single event might need to be on * multiple group's notification queues we can't add the event directly to each * queue and instead add a small "event_holder" to each queue. This event_holder * has a pointer back to the original event. Since the majority of events are * going to end up on one, and only one, notification queue we embed one * event_holder into each event. This means we have a single allocation instead * of always needing two. If the embedded event_holder is already in use by * another group a new event_holder (from fsnotify_event_holder_cachep) will be * allocated and used. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. * Called from fsnotify_move, which is inlined into filesystem modules. */ u32 fsnotify_get_cookie(void) { return atomic_inc_return(&fsnotify_sync_cookie); } EXPORT_SYMBOL_GPL(fsnotify_get_cookie); void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { /* Overflow events are per-group and we don't want to free them */ if (!event || event == group->overflow_event) return; /* * If the event is still queued, we have a problem... Do an unreliable * lockless check first to avoid locking in the common case. The * locking may be necessary for permission events which got removed * from the list by a different CPU than the one freeing the event. */ if (!list_empty(&event->list)) { spin_lock(&group->notification_lock); WARN_ON(!list_empty(&event->list)); spin_unlock(&group->notification_lock); } group->ops->free_event(group, event); } /* * Try to add an event to the notification queue. * The group can later pull this event off the queue to deal with. * The group can use the @merge hook to merge the event with a queued event. * The group can use the @insert hook to insert the event into hash table. * The function returns: * 0 if the event was added to a queue * 1 if the event was merged with some other queued event * 2 if the event was not queued - either the queue of events has overflown * or the group is shutting down. */ int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; pr_debug("%s: group=%p event=%p\n", __func__, group, event); spin_lock(&group->notification_lock); if (group->shutdown) { spin_unlock(&group->notification_lock); return 2; } if (event == group->overflow_event || group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; goto queue; } if (!list_empty(list) && merge) { ret = merge(group, event); if (ret) { spin_unlock(&group->notification_lock); return ret; } } queue: group->q_len++; list_add_tail(&event->list, list); if (insert) insert(group, event); spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); return ret; } void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event) { assert_spin_locked(&group->notification_lock); /* * We need to init list head for the case of overflow event so that * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; } /* * Return the first event on the notification list without removing it. * Returns NULL if the list is empty. */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); if (fsnotify_notify_queue_is_empty(group)) return NULL; return list_first_entry(&group->notification_list, struct fsnotify_event, list); } /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event = fsnotify_peek_first_event(group); if (!event) return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); fsnotify_remove_queued_event(group, event); return event; } /* * Called when a group is being torn down to clean up any outstanding * event notifications. */ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); }
5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 // SPDX-License-Identifier: GPL-2.0 #include <linux/build_bug.h> #include <linux/errno.h> #include <linux/errname.h> #include <linux/kernel.h> #include <linux/math.h> /* * Ensure these tables do not accidentally become gigantic if some * huge errno makes it in. On most architectures, the first table will * only have about 140 entries, but mips and parisc have more sparsely * allocated errnos (with EHWPOISON = 257 on parisc, and EDQUOT = 1133 * on mips), so this wastes a bit of space on those - though we * special case the EDQUOT case. */ #define E(err) [err + BUILD_BUG_ON_ZERO(err <= 0 || err > 300)] = "-" #err static const char *names_0[] = { E(E2BIG), E(EACCES), E(EADDRINUSE), E(EADDRNOTAVAIL), E(EADV), E(EAFNOSUPPORT), E(EAGAIN), /* EWOULDBLOCK */ E(EALREADY), E(EBADE), E(EBADF), E(EBADFD), E(EBADMSG), E(EBADR), E(EBADRQC), E(EBADSLT), E(EBFONT), E(EBUSY), E(ECANCELED), /* ECANCELLED */ E(ECHILD), E(ECHRNG), E(ECOMM), E(ECONNABORTED), E(ECONNREFUSED), /* EREFUSED */ E(ECONNRESET), E(EDEADLK), /* EDEADLOCK */ #if EDEADLK != EDEADLOCK /* mips, sparc, powerpc */ E(EDEADLOCK), #endif E(EDESTADDRREQ), E(EDOM), E(EDOTDOT), #ifndef CONFIG_MIPS E(EDQUOT), #endif E(EEXIST), E(EFAULT), E(EFBIG), E(EHOSTDOWN), E(EHOSTUNREACH), E(EHWPOISON), E(EIDRM), E(EILSEQ), #ifdef EINIT E(EINIT), #endif E(EINPROGRESS), E(EINTR), E(EINVAL), E(EIO), E(EISCONN), E(EISDIR), E(EISNAM), E(EKEYEXPIRED), E(EKEYREJECTED), E(EKEYREVOKED), E(EL2HLT), E(EL2NSYNC), E(EL3HLT), E(EL3RST), E(ELIBACC), E(ELIBBAD), E(ELIBEXEC), E(ELIBMAX), E(ELIBSCN), E(ELNRNG), E(ELOOP), E(EMEDIUMTYPE), E(EMFILE), E(EMLINK), E(EMSGSIZE), E(EMULTIHOP), E(ENAMETOOLONG), E(ENAVAIL), E(ENETDOWN), E(ENETRESET), E(ENETUNREACH), E(ENFILE), E(ENOANO), E(ENOBUFS), E(ENOCSI), E(ENODATA), E(ENODEV), E(ENOENT), E(ENOEXEC), E(ENOKEY), E(ENOLCK), E(ENOLINK), E(ENOMEDIUM), E(ENOMEM), E(ENOMSG), E(ENONET), E(ENOPKG), E(ENOPROTOOPT), E(ENOSPC), E(ENOSR), E(ENOSTR), E(ENOSYS), E(ENOTBLK), E(ENOTCONN), E(ENOTDIR), E(ENOTEMPTY), E(ENOTNAM), E(ENOTRECOVERABLE), E(ENOTSOCK), E(ENOTTY), E(ENOTUNIQ), E(ENXIO), E(EOPNOTSUPP), E(EOVERFLOW), E(EOWNERDEAD), E(EPERM), E(EPFNOSUPPORT), E(EPIPE), #ifdef EPROCLIM E(EPROCLIM), #endif E(EPROTO), E(EPROTONOSUPPORT), E(EPROTOTYPE), E(ERANGE), E(EREMCHG), #ifdef EREMDEV E(EREMDEV), #endif E(EREMOTE), E(EREMOTEIO), E(ERESTART), E(ERFKILL), E(EROFS), #ifdef ERREMOTE E(ERREMOTE), #endif E(ESHUTDOWN), E(ESOCKTNOSUPPORT), E(ESPIPE), E(ESRCH), E(ESRMNT), E(ESTALE), E(ESTRPIPE), E(ETIME), E(ETIMEDOUT), E(ETOOMANYREFS), E(ETXTBSY), E(EUCLEAN), E(EUNATCH), E(EUSERS), E(EXDEV), E(EXFULL), }; #undef E #ifdef EREFUSED /* parisc */ static_assert(EREFUSED == ECONNREFUSED); #endif #ifdef ECANCELLED /* parisc */ static_assert(ECANCELLED == ECANCELED); #endif static_assert(EAGAIN == EWOULDBLOCK); /* everywhere */ #define E(err) [err - 512 + BUILD_BUG_ON_ZERO(err < 512 || err > 550)] = "-" #err static const char *names_512[] = { E(ERESTARTSYS), E(ERESTARTNOINTR), E(ERESTARTNOHAND), E(ENOIOCTLCMD), E(ERESTART_RESTARTBLOCK), E(EPROBE_DEFER), E(EOPENSTALE), E(ENOPARAM), E(EBADHANDLE), E(ENOTSYNC), E(EBADCOOKIE), E(ENOTSUPP), E(ETOOSMALL), E(ESERVERFAULT), E(EBADTYPE), E(EJUKEBOX), E(EIOCBQUEUED), E(ERECALLCONFLICT), }; #undef E static const char *__errname(unsigned err) { if (err < ARRAY_SIZE(names_0)) return names_0[err]; if (err >= 512 && err - 512 < ARRAY_SIZE(names_512)) return names_512[err - 512]; /* But why? */ if (IS_ENABLED(CONFIG_MIPS) && err == EDQUOT) /* 1133 */ return "-EDQUOT"; return NULL; } /* * errname(EIO) -> "EIO" * errname(-EIO) -> "-EIO" */ const char *errname(int err) { const char *name = __errname(abs(err)); if (!name) return NULL; return err > 0 ? name + 1 : name; } EXPORT_SYMBOL(errname);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0 */ /* * 'Generic' ticket-lock implementation. * * It relies on atomic_fetch_add() having well defined forward progress * guarantees under contention. If your architecture cannot provide this, stick * to a test-and-set lock. * * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a * sub-word of the value. This is generally true for anything LL/SC although * you'd be hard pressed to find anything useful in architecture specifications * about this. If your architecture cannot do this you might be better off with * a test-and-set. * * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence * uses atomic_fetch_add() which is RCsc to create an RCsc hot path, along with * a full fence after the spin to upgrade the otherwise-RCpc * atomic_cond_read_acquire(). * * The implementation uses smp_cond_load_acquire() to spin, so if the * architecture has WFE like instructions to sleep instead of poll for word * modifications be sure to implement that (see ARM64 for example). * */ #ifndef __ASM_GENERIC_TICKET_SPINLOCK_H #define __ASM_GENERIC_TICKET_SPINLOCK_H #include <linux/atomic.h> #include <asm-generic/spinlock_types.h> static __always_inline void ticket_spin_lock(arch_spinlock_t *lock) { u32 val = atomic_fetch_add(1<<16, &lock->val); u16 ticket = val >> 16; if (ticket == (u16)val) return; /* * atomic_cond_read_acquire() is RCpc, but rather than defining a * custom cond_read_rcsc() here we just emit a full fence. We only * need the prior reads before subsequent writes ordering from * smb_mb(), but as atomic_cond_read_acquire() just emits reads and we * have no outstanding writes due to the atomic_fetch_add() the extra * orderings are free. */ atomic_cond_read_acquire(&lock->val, ticket == (u16)VAL); smp_mb(); } static __always_inline bool ticket_spin_trylock(arch_spinlock_t *lock) { u32 old = atomic_read(&lock->val); if ((old >> 16) != (old & 0xffff)) return false; return atomic_try_cmpxchg(&lock->val, &old, old + (1<<16)); /* SC, for RCsc */ } static __always_inline void ticket_spin_unlock(arch_spinlock_t *lock) { u16 *ptr = (u16 *)lock + IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); u32 val = atomic_read(&lock->val); smp_store_release(ptr, (u16)val + 1); } static __always_inline int ticket_spin_value_unlocked(arch_spinlock_t lock) { u32 val = lock.val.counter; return ((val >> 16) == (val & 0xffff)); } static __always_inline int ticket_spin_is_locked(arch_spinlock_t *lock) { arch_spinlock_t val = READ_ONCE(*lock); return !ticket_spin_value_unlocked(val); } static __always_inline int ticket_spin_is_contended(arch_spinlock_t *lock) { u32 val = atomic_read(&lock->val); return (s16)((val >> 16) - (val & 0xffff)) > 1; } #ifndef __no_arch_spinlock_redefine /* * Remapping spinlock architecture specific functions to the corresponding * ticket spinlock functions. */ #define arch_spin_is_locked(l) ticket_spin_is_locked(l) #define arch_spin_is_contended(l) ticket_spin_is_contended(l) #define arch_spin_value_unlocked(l) ticket_spin_value_unlocked(l) #define arch_spin_lock(l) ticket_spin_lock(l) #define arch_spin_trylock(l) ticket_spin_trylock(l) #define arch_spin_unlock(l) ticket_spin_unlock(l) #endif #endif /* __ASM_GENERIC_TICKET_SPINLOCK_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/can/dev.h * * Definitions for the CAN network device driver interface * * Copyright (C) 2006 Andrey Volkov <avolkov@varma-el.com> * Varma Electronics Oy * * Copyright (C) 2008 Wolfgang Grandegger <wg@grandegger.com> * */ #ifndef _CAN_DEV_H #define _CAN_DEV_H #include <linux/can.h> #include <linux/can/bittiming.h> #include <linux/can/error.h> #include <linux/can/length.h> #include <linux/can/netlink.h> #include <linux/can/skb.h> #include <linux/ethtool.h> #include <linux/netdevice.h> /* * CAN mode */ enum can_mode { CAN_MODE_STOP = 0, CAN_MODE_START, CAN_MODE_SLEEP }; enum can_termination_gpio { CAN_TERMINATION_GPIO_DISABLED = 0, CAN_TERMINATION_GPIO_ENABLED, CAN_TERMINATION_GPIO_MAX, }; /* * CAN common private data */ struct can_priv { struct net_device *dev; struct can_device_stats can_stats; const struct can_bittiming_const *bittiming_const; struct can_bittiming bittiming; struct data_bittiming_params fd, xl; unsigned int bitrate_const_cnt; const u32 *bitrate_const; u32 bitrate_max; struct can_clock clock; unsigned int termination_const_cnt; const u16 *termination_const; u16 termination; struct gpio_desc *termination_gpio; u16 termination_gpio_ohms[CAN_TERMINATION_GPIO_MAX]; unsigned int echo_skb_max; struct sk_buff **echo_skb; enum can_state state; /* CAN controller features - see include/uapi/linux/can/netlink.h */ u32 ctrlmode; /* current options setting */ u32 ctrlmode_supported; /* options that can be modified by netlink */ int restart_ms; struct delayed_work restart_work; int (*do_set_bittiming)(struct net_device *dev); int (*do_set_mode)(struct net_device *dev, enum can_mode mode); int (*do_set_termination)(struct net_device *dev, u16 term); int (*do_get_state)(const struct net_device *dev, enum can_state *state); int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); }; static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv) { return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK); } static inline bool can_xl_tdc_is_enabled(const struct can_priv *priv) { return !!(priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MASK); } static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; } static inline bool can_is_canxl_dev_mtu(unsigned int mtu) { return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU); } void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs); #define alloc_candev(sizeof_priv, echo_skb_max) \ alloc_candev_mqs(sizeof_priv, echo_skb_max, 1, 1) #define alloc_candev_mq(sizeof_priv, echo_skb_max, count) \ alloc_candev_mqs(sizeof_priv, echo_skb_max, count, count) void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ #if IS_ENABLED(CONFIG_CAN_NETLINK) struct can_priv *safe_candev_priv(struct net_device *dev); #else static inline struct can_priv *safe_candev_priv(struct net_device *dev) { return NULL; } #endif int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); int can_hwtstamp_get(struct net_device *netdev, struct kernel_hwtstamp_config *cfg); int can_hwtstamp_set(struct net_device *netdev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct kernel_ethtool_ts_info *info); int register_candev(struct net_device *dev); void unregister_candev(struct net_device *dev); int can_restart_now(struct net_device *dev); void can_bus_off(struct net_device *dev); const char *can_get_state_str(const enum can_state state); const char *can_get_ctrlmode_str(u32 ctrlmode); static inline bool can_dev_in_xl_only_mode(struct can_priv *priv) { const u32 mixed_mode = CAN_CTRLMODE_FD | CAN_CTRLMODE_XL; /* When CAN XL is enabled but FD is disabled we are running in * the so-called 'CANXL-only mode' where the error signalling is * disabled. This helper function determines the required value * to disable error signalling in the CAN XL controller. * The so-called CC/FD/XL 'mixed mode' requires error signalling. */ return ((priv->ctrlmode & mixed_mode) == CAN_CTRLMODE_XL); } /* drop skb if it does not contain a valid CAN frame for sending */ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb) { struct can_priv *priv = netdev_priv(dev); u32 silent_mode = priv->ctrlmode & (CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_RESTRICTED); if (silent_mode) { netdev_info_once(dev, "interface in %s mode, dropping skb\n", can_get_ctrlmode_str(silent_mode)); goto invalid_skb; } if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) { netdev_info_once(dev, "CAN FD is disabled, dropping skb\n"); goto invalid_skb; } if (can_dev_in_xl_only_mode(priv) && !can_is_canxl_skb(skb)) { netdev_info_once(dev, "Error signaling is disabled, dropping skb\n"); goto invalid_skb; } return can_dropped_invalid_skb(dev, skb); invalid_skb: kfree_skb(skb); dev->stats.tx_dropped++; return true; } void can_state_get_by_berr_counter(const struct net_device *dev, const struct can_berr_counter *bec, enum can_state *tx_state, enum can_state *rx_state); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state); #ifdef CONFIG_OF void of_can_transceiver(struct net_device *dev); #else static inline void of_can_transceiver(struct net_device *dev) { } #endif extern struct rtnl_link_ops can_link_ops; int can_netlink_register(void); void can_netlink_unregister(void); #endif /* !_CAN_DEV_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 /* * xxHash - Extremely Fast Hash algorithm * Copyright (C) 2012-2016, Yann Collet. * * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. This program is dual-licensed; you may select * either version 2 of the GNU General Public License ("GPL") or BSD license * ("BSD"). * * You can contact the author at: * - xxHash homepage: https://cyan4973.github.io/xxHash/ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ #include <linux/unaligned.h> #include <linux/errno.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/xxhash.h> /*-************************************* * Macros **************************************/ #define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r))) #define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r))) #ifdef __LITTLE_ENDIAN # define XXH_CPU_LITTLE_ENDIAN 1 #else # define XXH_CPU_LITTLE_ENDIAN 0 #endif /*-************************************* * Constants **************************************/ static const uint32_t PRIME32_1 = 2654435761U; static const uint32_t PRIME32_2 = 2246822519U; static const uint32_t PRIME32_3 = 3266489917U; static const uint32_t PRIME32_4 = 668265263U; static const uint32_t PRIME32_5 = 374761393U; static const uint64_t PRIME64_1 = 11400714785074694791ULL; static const uint64_t PRIME64_2 = 14029467366897019727ULL; static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; /*-*************************** * Simple Hash Functions ****************************/ static uint32_t xxh32_round(uint32_t seed, const uint32_t input) { seed += input * PRIME32_2; seed = xxh_rotl32(seed, 13); seed *= PRIME32_1; return seed; } uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *b_end = p + len; uint32_t h32; if (len >= 16) { const uint8_t *const limit = b_end - 16; uint32_t v1 = seed + PRIME32_1 + PRIME32_2; uint32_t v2 = seed + PRIME32_2; uint32_t v3 = seed + 0; uint32_t v4 = seed - PRIME32_1; do { v1 = xxh32_round(v1, get_unaligned_le32(p)); p += 4; v2 = xxh32_round(v2, get_unaligned_le32(p)); p += 4; v3 = xxh32_round(v3, get_unaligned_le32(p)); p += 4; v4 = xxh32_round(v4, get_unaligned_le32(p)); p += 4; } while (p <= limit); h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) + xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18); } else { h32 = seed + PRIME32_5; } h32 += (uint32_t)len; while (p + 4 <= b_end) { h32 += get_unaligned_le32(p) * PRIME32_3; h32 = xxh_rotl32(h32, 17) * PRIME32_4; p += 4; } while (p < b_end) { h32 += (*p) * PRIME32_5; h32 = xxh_rotl32(h32, 11) * PRIME32_1; p++; } h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } EXPORT_SYMBOL(xxh32); static uint64_t xxh64_round(uint64_t acc, const uint64_t input) { acc += input * PRIME64_2; acc = xxh_rotl64(acc, 31); acc *= PRIME64_1; return acc; } static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) { val = xxh64_round(0, val); acc ^= val; acc = acc * PRIME64_1 + PRIME64_4; return acc; } uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; uint64_t h64; if (len >= 32) { const uint8_t *const limit = b_end - 32; uint64_t v1 = seed + PRIME64_1 + PRIME64_2; uint64_t v2 = seed + PRIME64_2; uint64_t v3 = seed + 0; uint64_t v4 = seed - PRIME64_1; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = seed + PRIME64_5; } h64 += (uint64_t)len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ struct xxh64_state state; memset(&state, 0, sizeof(state)); state.v1 = seed + PRIME64_1 + PRIME64_2; state.v2 = seed + PRIME64_2; state.v3 = seed + 0; state.v4 = seed - PRIME64_1; memcpy(statePtr, &state, sizeof(state)); } EXPORT_SYMBOL(xxh64_reset); int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; if (input == NULL) return -EINVAL; state->total_len += len; if (state->memsize + len < 32) { /* fill in tmp buffer */ memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); state->memsize += (uint32_t)len; return 0; } if (state->memsize) { /* tmp buffer is full */ uint64_t *p64 = state->mem64; memcpy(((uint8_t *)p64) + state->memsize, input, 32 - state->memsize); state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); p64++; state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); p64++; state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); p64++; state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); p += 32 - state->memsize; state->memsize = 0; } if (p + 32 <= b_end) { const uint8_t *const limit = b_end - 32; uint64_t v1 = state->v1; uint64_t v2 = state->v2; uint64_t v3 = state->v3; uint64_t v4 = state->v4; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < b_end) { memcpy(state->mem64, p, (size_t)(b_end-p)); state->memsize = (uint32_t)(b_end - p); } return 0; } EXPORT_SYMBOL(xxh64_update); uint64_t xxh64_digest(const struct xxh64_state *state) { const uint8_t *p = (const uint8_t *)state->mem64; const uint8_t *const b_end = (const uint8_t *)state->mem64 + state->memsize; uint64_t h64; if (state->total_len >= 32) { const uint64_t v1 = state->v1; const uint64_t v2 = state->v2; const uint64_t v3 = state->v3; const uint64_t v4 = state->v4; h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = state->v3 + PRIME64_5; } h64 += (uint64_t)state->total_len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64_digest); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("xxHash");
1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SUSPEND_H #define _LINUX_SUSPEND_H #include <linux/swap.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/pm.h> #include <linux/mm.h> #include <linux/freezer.h> #include <asm/errno.h> #ifdef CONFIG_VT extern void pm_set_vt_switch(int); #else static inline void pm_set_vt_switch(int do_switch) { } #endif #ifdef CONFIG_VT_CONSOLE_SLEEP extern void pm_prepare_console(void); extern void pm_restore_console(void); #else static inline void pm_prepare_console(void) { } static inline void pm_restore_console(void) { } #endif typedef int __bitwise suspend_state_t; #define PM_SUSPEND_ON ((__force suspend_state_t) 0) #define PM_SUSPEND_TO_IDLE ((__force suspend_state_t) 1) #define PM_SUSPEND_STANDBY ((__force suspend_state_t) 2) #define PM_SUSPEND_MEM ((__force suspend_state_t) 3) #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. * * @valid: Callback to determine if given system sleep state is supported by * the platform. * Valid (ie. supported) states are advertised in /sys/power/state. Note * that it still may be impossible to enter given system sleep state if the * conditions aren't right. * There is the %suspend_valid_only_mem function available that can be * assigned to this if the platform only supports mem sleep. * * @begin: Initialise a transition to given system sleep state. * @begin() is executed right prior to suspending devices. The information * conveyed to the platform code by @begin() should be disregarded by it as * soon as @end() is executed. If @begin() fails (ie. returns nonzero), * @prepare(), @enter() and @finish() will not be called by the PM core. * This callback is optional. However, if it is implemented, the argument * passed to @enter() is redundant and should be ignored. * * @prepare: Prepare the platform for entering the system sleep state indicated * by @begin(). * @prepare() is called right after devices have been suspended (ie. the * appropriate .suspend() method has been executed for each device) and * before device drivers' late suspend callbacks are executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@prepare_late(), @enter(), * and @wake() will not be called in that case). * * @prepare_late: Finish preparing the platform for entering the system sleep * state indicated by @begin(). * @prepare_late is called before disabling nonboot CPUs and after * device drivers' late suspend callbacks have been executed. It returns * 0 on success or a negative error code otherwise, in which case the * system cannot enter the desired sleep state (@enter() will not be * executed). * * @enter: Enter the system sleep state indicated by @begin() or represented by * the argument if @begin() is not implemented. * This callback is mandatory. It returns 0 on success or a negative * error code otherwise, in which case the system cannot enter the desired * sleep state. * * @wake: Called when the system has just left a sleep state, right after * the nonboot CPUs have been enabled and before device drivers' early * resume callbacks are executed. * This callback is optional, but should be implemented by the platforms * that implement @prepare_late(). If implemented, it is always called * after @prepare_late and @enter(), even if one of them fails. * * @finish: Finish wake-up of the platform. * @finish is called right prior to calling device drivers' regular suspend * callbacks. * This callback is optional, but should be implemented by the platforms * that implement @prepare(). If implemented, it is always called after * @enter() and @wake(), even if any of them fails. It is executed after * a failing @prepare. * * @suspend_again: Returns whether the system should suspend again (true) or * not (false). If the platform wants to poll sensors or execute some * code during suspended without invoking userspace and most of devices, * suspend_again callback is the place assuming that periodic-wakeup or * alarm-wakeup is already setup. This allows to execute some codes while * being kept suspended in the view of userland and devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state or * the transition to the sleep state has been aborted. * This callback is optional, but should be implemented by the platforms * that implement @begin(). Accordingly, platforms implementing @begin() * should also provide a @end() which cleans up transitions aborted before * @enter(). * * @recover: Recover the platform from a suspend failure. * Called by the PM core if the suspending of devices fails. * This callback is optional and should only be implemented by platforms * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); int (*begin)(suspend_state_t state); int (*prepare)(void); int (*prepare_late)(void); int (*enter)(suspend_state_t state); void (*wake)(void); void (*finish)(void); bool (*suspend_again)(void); void (*end)(void); void (*recover)(void); }; struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); void (*check)(void); bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); void (*end)(void); }; #ifdef CONFIG_SUSPEND extern suspend_state_t pm_suspend_target_state; extern suspend_state_t mem_sleep_current; extern suspend_state_t mem_sleep_default; /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); extern unsigned int pm_suspend_global_flags; #define PM_SUSPEND_FLAG_FW_SUSPEND BIT(0) #define PM_SUSPEND_FLAG_FW_RESUME BIT(1) #define PM_SUSPEND_FLAG_NO_PLATFORM BIT(2) static inline void pm_suspend_clear_flags(void) { pm_suspend_global_flags = 0; } static inline void pm_set_suspend_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_SUSPEND; } static inline void pm_set_resume_via_firmware(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_RESUME; } static inline void pm_set_suspend_no_platform(void) { pm_suspend_global_flags |= PM_SUSPEND_FLAG_NO_PLATFORM; } /** * pm_suspend_via_firmware - Check if platform firmware will suspend the system. * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the platform firmware is going to be invoked at the end of * the system-wide power management transition (to a sleep state) in progress in * order to complete it, or if the platform firmware has been invoked in order * to complete the last (or preceding) transition of the system to a sleep * state. * * This matters if the caller needs or wants to carry out some special actions * depending on whether or not control will be passed to the platform firmware * subsequently (for example, the device may need to be reset before letting the * platform firmware manipulate it, which is not necessary when the platform * firmware is not going to be invoked) or when such special actions may have * been carried out during the preceding transition of the system to a sleep * state (as they may need to be taken into account). */ static inline bool pm_suspend_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_SUSPEND); } /** * pm_resume_via_firmware - Check if platform firmware has woken up the system. * * To be called during system-wide power management transitions from sleep * states. * * Return 'true' if the platform firmware has passed control to the kernel at * the beginning of the system-wide power management transition in progress, so * the event that woke up the system from sleep has been handled by the platform * firmware. */ static inline bool pm_resume_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_RESUME); } /** * pm_suspend_no_platform - Check if platform may change device power states. * * To be called during system-wide power management transitions to sleep states * or during the subsequent system-wide transitions back to the working state. * * Return 'true' if the power states of devices remain under full control of the * kernel throughout the system-wide suspend and resume cycle in progress (that * is, if a device is put into a certain power state during suspend, it can be * expected to remain in that state during resume). */ static inline bool pm_suspend_no_platform(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_NO_PLATFORM); } /* Suspend-to-idle state machnine. */ enum s2idle_states { S2IDLE_STATE_NONE, /* Not suspended/suspending. */ S2IDLE_STATE_ENTER, /* Enter suspend-to-idle. */ S2IDLE_STATE_WAKE, /* Wake up from suspend-to-idle. */ }; extern enum s2idle_states __read_mostly s2idle_state; static inline bool idle_should_enter_s2idle(void) { return unlikely(s2idle_state == S2IDLE_STATE_ENTER); } extern bool pm_suspend_default_s2idle(void); extern void __init pm_states_init(void); extern void s2idle_set_ops(const struct platform_s2idle_ops *ops); extern void s2idle_wake(void); /** * arch_suspend_disable_irqs - disable IRQs for suspend * * Disables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_disable_irqs(void); /** * arch_suspend_enable_irqs - enable IRQs after suspend * * Enables IRQs (in the default case). This is a weak symbol in the common * code and thus allows architectures to override it if more needs to be * done. Not called for suspend to disk. */ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL #define pm_suspend_target_state (PM_SUSPEND_ON) static inline void pm_suspend_clear_flags(void) {} static inline void pm_set_suspend_via_firmware(void) {} static inline void pm_set_resume_via_firmware(void) {} static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } static inline bool sync_on_suspend_enabled(void) { return true; } static inline bool idle_should_enter_s2idle(void) { return false; } static inline void __init pm_states_init(void) {} static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ static inline bool pm_suspend_in_progress(void) { return pm_suspend_target_state != PM_SUSPEND_ON; } /* struct pbe is used for creating lists of pages that should be restored * atomically during the resume from disk, because the page frames they have * occupied before the suspend are in use. */ struct pbe { void *address; /* address of the copy */ void *orig_address; /* original address of a page */ struct pbe *next; }; /** * struct platform_hibernation_ops - hibernation platform support * * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation. * Called right after shrinking memory and before freezing devices. * * @end: Called by the PM core right after resuming devices, to indicate to * the platform that the system has returned to the working state. * * @pre_snapshot: Prepare the platform for creating the hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @finish: Restore the previous state of the platform after the hibernation * image has been created *or* put the platform into the normal operation * mode after the hibernation (the same method is executed in both cases). * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @prepare: Prepare the platform for entering the low power state. * Called right after the hibernation image has been saved and before * devices are prepared for entering the low power state. * * @enter: Put the system into the low power state after the hibernation image * has been saved to disk. * Called after the nonboot CPUs have been disabled and all of the low * level devices have been shut down (runs with IRQs off). * * @leave: Perform the first stage of the cleanup after the system sleep state * indicated by @set_target() has been left. * Called right after the control has been passed from the boot kernel to * the image kernel, before the nonboot CPUs are enabled and before devices * are resumed. Executed with interrupts disabled. * * @pre_restore: Prepare system for the restoration from a hibernation image. * Called right after devices have been frozen and before the nonboot * CPUs are disabled (runs with IRQs on). * * @restore_cleanup: Clean up after a failing image restoration. * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). * * @recover: Recover the platform from a failure to suspend devices. * Called by the PM core if the suspending of devices during hibernation * fails. This callback is optional and should only be implemented by * platforms which require special recovery actions in that situation. */ struct platform_hibernation_ops { int (*begin)(pm_message_t stage); void (*end)(void); int (*pre_snapshot)(void); void (*finish)(void); int (*prepare)(void); int (*enter)(void); void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); void (*recover)(void); }; #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void register_nosave_region(unsigned long b, unsigned long e); extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern u32 swsusp_hardware_signature; extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; int pfn_is_nosave(unsigned long pfn); int hibernate_quiet_exec(int (*func)(void *data), void *data); int hibernate_resume_nonboot_cpu_disable(void); int arch_hibernation_header_save(void *addr, unsigned int max_size); int arch_hibernation_header_restore(void *addr); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } static inline bool hibernation_available(void) { return false; } static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { return -ENOTSUPP; } #endif /* CONFIG_HIBERNATION */ #if defined(CONFIG_HIBERNATION) && defined(CONFIG_SUSPEND) bool pm_hibernation_mode_is_suspend(void); #else static inline bool pm_hibernation_mode_is_suspend(void) { return false; } #endif int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV int is_hibernate_resume_dev(dev_t dev); #else static inline int is_hibernate_resume_dev(dev_t dev) { return 0; } #endif /* Hibernation and suspend events */ #define PM_HIBERNATION_PREPARE 0x0001 /* Going to hibernate */ #define PM_POST_HIBERNATION 0x0002 /* Hibernation finished */ #define PM_SUSPEND_PREPARE 0x0003 /* Going to suspend the system */ #define PM_POST_SUSPEND 0x0004 /* Suspend finished */ #define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */ #define PM_POST_RESTORE 0x0006 /* Restore failed */ extern struct mutex system_transition_mutex; #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); /* kernel/power/main.c */ extern int register_pm_notifier(struct notifier_block *nb); extern int unregister_pm_notifier(struct notifier_block *nb); extern void ksys_sync_helper(void); extern void pm_report_hw_sleep_time(u64 t); extern void pm_report_max_hw_sleep(u64 t); void pm_restrict_gfp_mask(void); void pm_restore_gfp_mask(void); #define pm_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ register_pm_notifier(&fn##_nb); \ } /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; static inline bool pm_suspended_storage(void) { return !gfp_has_io_fs(gfp_allowed_mask); } extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); extern void pm_print_active_wakeup_sources(void); extern unsigned int lock_system_sleep(void); extern void unlock_system_sleep(unsigned int); extern bool pm_sleep_transition_in_progress(void); bool pm_hibernate_is_recovering(void); #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_pm_notifier(struct notifier_block *nb) { return 0; } static inline void pm_report_hw_sleep_time(u64 t) {}; static inline void pm_report_max_hw_sleep(u64 t) {}; static inline void pm_restrict_gfp_mask(void) {} static inline void pm_restore_gfp_mask(void) {} static inline void ksys_sync_helper(void) {} #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_suspended_storage(void) { return false; } static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline unsigned int lock_system_sleep(void) { return 0; } static inline void unlock_system_sleep(unsigned int flags) {} static inline bool pm_sleep_transition_in_progress(void) { return false; } static inline bool pm_hibernate_is_recovering(void) { return false; } #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; extern bool pm_debug_messages_should_print(void); static inline int pm_dyn_debug_messages_on(void) { #ifdef CONFIG_DYNAMIC_DEBUG return 1; #else return 0; #endif } #ifndef pr_fmt #define pr_fmt(fmt) "PM: " fmt #endif #define __pm_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ else if (pm_dyn_debug_messages_on()) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define __pm_deferred_pr_dbg(fmt, ...) \ do { \ if (pm_debug_messages_should_print()) \ printk_deferred(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include <linux/printk.h> #define __pm_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #define __pm_deferred_pr_dbg(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /** * pm_pr_dbg - print pm sleep debug messages * * If pm_debug_messages_on is enabled and the system is entering/leaving * suspend, print message. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, * print message only from instances explicitly enabled on dynamic debug's * control. * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, * don't print message. */ #define pm_pr_dbg(fmt, ...) \ __pm_pr_dbg(fmt, ##__VA_ARGS__) #define pm_deferred_pr_dbg(fmt, ...) \ __pm_deferred_pr_dbg(fmt, ##__VA_ARGS__) #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ void queue_up_suspend_work(void); #else /* !CONFIG_PM_AUTOSLEEP */ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ enum suspend_stat_step { SUSPEND_WORKING = 0, SUSPEND_FREEZE, SUSPEND_PREPARE, SUSPEND_SUSPEND, SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; void dpm_save_failed_dev(const char *name); void dpm_save_failed_step(enum suspend_stat_step step); #endif /* _LINUX_SUSPEND_H */
16 15 14 13 1 1 19 19 3 3 2 1 16 12 11 11 10 1 9 1 19 14 11 10 10 10 1 1 1 2 2 2 1 3 2 1 5 8 6 8 7 1 6 2 5 9 4 8 9 8 4 4 8 7 4 5 5 5 1 4 4 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - System call implementations and user space interfaces * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation */ #include <asm/current.h> #include <linux/anon_inodes.h> #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/capability.h> #include <linux/cleanup.h> #include <linux/compiler_types.h> #include <linux/dcache.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/sched.h> #include <linux/security.h> #include <linux/stddef.h> #include <linux/syscalls.h> #include <linux/types.h> #include <linux/uaccess.h> #include <uapi/linux/landlock.h> #include "cred.h" #include "domain.h" #include "fs.h" #include "limits.h" #include "net.h" #include "ruleset.h" #include "setup.h" static bool is_initialized(void) { if (likely(landlock_initialized)) return true; pr_warn_once( "Disabled but requested by user space. " "You should enable Landlock at boot time: " "https://docs.kernel.org/userspace-api/landlock.html#boot-time-configuration\n"); return false; } /** * copy_min_struct_from_user - Safe future-proof argument copying * * Extend copy_struct_from_user() to check for consistent user buffer. * * @dst: Kernel space pointer or NULL. * @ksize: Actual size of the data pointed to by @dst. * @ksize_min: Minimal required size to be copied. * @src: User space pointer or NULL. * @usize: (Alleged) size of the data pointed to by @src. */ static __always_inline int copy_min_struct_from_user(void *const dst, const size_t ksize, const size_t ksize_min, const void __user *const src, const size_t usize) { /* Checks buffer inconsistencies. */ BUILD_BUG_ON(!dst); if (!src) return -EFAULT; /* Checks size ranges. */ BUILD_BUG_ON(ksize <= 0); BUILD_BUG_ON(ksize < ksize_min); if (usize < ksize_min) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; /* Copies user buffer and fills with zeros. */ return copy_struct_from_user(dst, ksize, src, usize); } /* * This function only contains arithmetic operations with constants, leading to * BUILD_BUG_ON(). The related code is evaluated and checked at build time, * but it is then ignored thanks to compiler optimizations. */ static void build_check_abi(void) { struct landlock_ruleset_attr ruleset_attr; struct landlock_path_beneath_attr path_beneath_attr; struct landlock_net_port_attr net_port_attr; size_t ruleset_size, path_beneath_size, net_port_size; /* * For each user space ABI structures, first checks that there is no * hole in them, then checks that all architectures have the same * struct size. */ ruleset_size = sizeof(ruleset_attr.handled_access_fs); ruleset_size += sizeof(ruleset_attr.handled_access_net); ruleset_size += sizeof(ruleset_attr.scoped); BUILD_BUG_ON(sizeof(ruleset_attr) != ruleset_size); BUILD_BUG_ON(sizeof(ruleset_attr) != 24); path_beneath_size = sizeof(path_beneath_attr.allowed_access); path_beneath_size += sizeof(path_beneath_attr.parent_fd); BUILD_BUG_ON(sizeof(path_beneath_attr) != path_beneath_size); BUILD_BUG_ON(sizeof(path_beneath_attr) != 12); net_port_size = sizeof(net_port_attr.allowed_access); net_port_size += sizeof(net_port_attr.port); BUILD_BUG_ON(sizeof(net_port_attr) != net_port_size); BUILD_BUG_ON(sizeof(net_port_attr) != 16); } /* Ruleset handling */ static int fop_ruleset_release(struct inode *const inode, struct file *const filp) { struct landlock_ruleset *ruleset = filp->private_data; landlock_put_ruleset(ruleset); return 0; } static ssize_t fop_dummy_read(struct file *const filp, char __user *const buf, const size_t size, loff_t *const ppos) { /* Dummy handler to enable FMODE_CAN_READ. */ return -EINVAL; } static ssize_t fop_dummy_write(struct file *const filp, const char __user *const buf, const size_t size, loff_t *const ppos) { /* Dummy handler to enable FMODE_CAN_WRITE. */ return -EINVAL; } /* * A ruleset file descriptor enables to build a ruleset by adding (i.e. * writing) rule after rule, without relying on the task's context. This * reentrant design is also used in a read way to enforce the ruleset on the * current task. */ static const struct file_operations ruleset_fops = { .release = fop_ruleset_release, .read = fop_dummy_read, .write = fop_dummy_write, }; /* * The Landlock ABI version should be incremented for each new Landlock-related * user space visible change (e.g. Landlock syscalls). This version should * only be incremented once per Linux release, and the date in * Documentation/userspace-api/landlock.rst should be updated to reflect the * UAPI change. */ const int landlock_abi_version = 7; /** * sys_landlock_create_ruleset - Create a new ruleset * * @attr: Pointer to a &struct landlock_ruleset_attr identifying the scope of * the new ruleset. * @size: Size of the pointed &struct landlock_ruleset_attr (needed for * backward and forward compatibility). * @flags: Supported values: * * - %LANDLOCK_CREATE_RULESET_VERSION * - %LANDLOCK_CREATE_RULESET_ERRATA * * This system call enables to create a new Landlock ruleset, and returns the * related file descriptor on success. * * If %LANDLOCK_CREATE_RULESET_VERSION or %LANDLOCK_CREATE_RULESET_ERRATA is * set, then @attr must be NULL and @size must be 0. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EINVAL: unknown @flags, or unknown access, or unknown scope, or too small @size; * - %E2BIG: @attr or @size inconsistencies; * - %EFAULT: @attr or @size inconsistencies; * - %ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. * * .. kernel-doc:: include/uapi/linux/landlock.h * :identifiers: landlock_create_ruleset_flags */ SYSCALL_DEFINE3(landlock_create_ruleset, const struct landlock_ruleset_attr __user *const, attr, const size_t, size, const __u32, flags) { struct landlock_ruleset_attr ruleset_attr; struct landlock_ruleset *ruleset; int err, ruleset_fd; /* Build-time checks. */ build_check_abi(); if (!is_initialized()) return -EOPNOTSUPP; if (flags) { if (attr || size) return -EINVAL; if (flags == LANDLOCK_CREATE_RULESET_VERSION) return landlock_abi_version; if (flags == LANDLOCK_CREATE_RULESET_ERRATA) return landlock_errata; return -EINVAL; } /* Copies raw user space buffer. */ err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr), offsetofend(typeof(ruleset_attr), handled_access_fs), attr, size); if (err) return err; /* Checks content (and 32-bits cast). */ if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) != LANDLOCK_MASK_ACCESS_FS) return -EINVAL; /* Checks network content (and 32-bits cast). */ if ((ruleset_attr.handled_access_net | LANDLOCK_MASK_ACCESS_NET) != LANDLOCK_MASK_ACCESS_NET) return -EINVAL; /* Checks IPC scoping content (and 32-bits cast). */ if ((ruleset_attr.scoped | LANDLOCK_MASK_SCOPE) != LANDLOCK_MASK_SCOPE) return -EINVAL; /* Checks arguments and transforms to kernel struct. */ ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs, ruleset_attr.handled_access_net, ruleset_attr.scoped); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); /* Creates anonymous FD referring to the ruleset. */ ruleset_fd = anon_inode_getfd("[landlock-ruleset]", &ruleset_fops, ruleset, O_RDWR | O_CLOEXEC); if (ruleset_fd < 0) landlock_put_ruleset(ruleset); return ruleset_fd; } /* * Returns an owned ruleset from a FD. It is thus needed to call * landlock_put_ruleset() on the return value. */ static struct landlock_ruleset *get_ruleset_from_fd(const int fd, const fmode_t mode) { CLASS(fd, ruleset_f)(fd); struct landlock_ruleset *ruleset; if (fd_empty(ruleset_f)) return ERR_PTR(-EBADF); /* Checks FD type and access right. */ if (fd_file(ruleset_f)->f_op != &ruleset_fops) return ERR_PTR(-EBADFD); if (!(fd_file(ruleset_f)->f_mode & mode)) return ERR_PTR(-EPERM); ruleset = fd_file(ruleset_f)->private_data; if (WARN_ON_ONCE(ruleset->num_layers != 1)) return ERR_PTR(-EINVAL); landlock_get_ruleset(ruleset); return ruleset; } /* Path handling */ /* * @path: Must call put_path(@path) after the call if it succeeded. */ static int get_path_from_fd(const s32 fd, struct path *const path) { CLASS(fd_raw, f)(fd); BUILD_BUG_ON(!__same_type( fd, ((struct landlock_path_beneath_attr *)NULL)->parent_fd)); if (fd_empty(f)) return -EBADF; /* * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including * pseudo filesystems that will never be mountable (e.g. sockfs, * pipefs). */ if ((fd_file(f)->f_op == &ruleset_fops) || (fd_file(f)->f_path.mnt->mnt_flags & MNT_INTERNAL) || (fd_file(f)->f_path.dentry->d_sb->s_flags & SB_NOUSER) || IS_PRIVATE(d_backing_inode(fd_file(f)->f_path.dentry))) return -EBADFD; *path = fd_file(f)->f_path; path_get(path); return 0; } static int add_rule_path_beneath(struct landlock_ruleset *const ruleset, const void __user *const rule_attr) { struct landlock_path_beneath_attr path_beneath_attr; struct path path; int res, err; access_mask_t mask; /* Copies raw user space buffer. */ res = copy_from_user(&path_beneath_attr, rule_attr, sizeof(path_beneath_attr)); if (res) return -EFAULT; /* * Informs about useless rule: empty allowed_access (i.e. deny rules) * are ignored in path walks. */ if (!path_beneath_attr.allowed_access) return -ENOMSG; /* Checks that allowed_access matches the @ruleset constraints. */ mask = ruleset->access_masks[0].fs; if ((path_beneath_attr.allowed_access | mask) != mask) return -EINVAL; /* Gets and checks the new rule. */ err = get_path_from_fd(path_beneath_attr.parent_fd, &path); if (err) return err; /* Imports the new rule. */ err = landlock_append_fs_rule(ruleset, &path, path_beneath_attr.allowed_access); path_put(&path); return err; } static int add_rule_net_port(struct landlock_ruleset *ruleset, const void __user *const rule_attr) { struct landlock_net_port_attr net_port_attr; int res; access_mask_t mask; /* Copies raw user space buffer. */ res = copy_from_user(&net_port_attr, rule_attr, sizeof(net_port_attr)); if (res) return -EFAULT; /* * Informs about useless rule: empty allowed_access (i.e. deny rules) * are ignored by network actions. */ if (!net_port_attr.allowed_access) return -ENOMSG; /* Checks that allowed_access matches the @ruleset constraints. */ mask = landlock_get_net_access_mask(ruleset, 0); if ((net_port_attr.allowed_access | mask) != mask) return -EINVAL; /* Denies inserting a rule with port greater than 65535. */ if (net_port_attr.port > U16_MAX) return -EINVAL; /* Imports the new rule. */ return landlock_append_net_rule(ruleset, net_port_attr.port, net_port_attr.allowed_access); } /** * sys_landlock_add_rule - Add a new rule to a ruleset * * @ruleset_fd: File descriptor tied to the ruleset that should be extended * with the new rule. * @rule_type: Identify the structure type pointed to by @rule_attr: * %LANDLOCK_RULE_PATH_BENEATH or %LANDLOCK_RULE_NET_PORT. * @rule_attr: Pointer to a rule (matching the @rule_type). * @flags: Must be 0. * * This system call enables to define a new rule and add it to an existing * ruleset. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EAFNOSUPPORT: @rule_type is %LANDLOCK_RULE_NET_PORT but TCP/IP is not * supported by the running kernel; * - %EINVAL: @flags is not 0; * - %EINVAL: The rule accesses are inconsistent (i.e. * &landlock_path_beneath_attr.allowed_access or * &landlock_net_port_attr.allowed_access is not a subset of the ruleset * handled accesses) * - %EINVAL: &landlock_net_port_attr.port is greater than 65535; * - %ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access is * 0); * - %EBADF: @ruleset_fd is not a file descriptor for the current thread, or a * member of @rule_attr is not a file descriptor as expected; * - %EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of * @rule_attr is not the expected file descriptor type; * - %EPERM: @ruleset_fd has no write access to the underlying ruleset; * - %EFAULT: @rule_attr was not a valid address. */ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, const enum landlock_rule_type, rule_type, const void __user *const, rule_attr, const __u32, flags) { struct landlock_ruleset *ruleset __free(landlock_put_ruleset) = NULL; if (!is_initialized()) return -EOPNOTSUPP; /* No flag for now. */ if (flags) return -EINVAL; /* Gets and checks the ruleset. */ ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_WRITE); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); switch (rule_type) { case LANDLOCK_RULE_PATH_BENEATH: return add_rule_path_beneath(ruleset, rule_attr); case LANDLOCK_RULE_NET_PORT: return add_rule_net_port(ruleset, rule_attr); default: return -EINVAL; } } /* Enforcement */ /** * sys_landlock_restrict_self - Enforce a ruleset on the calling thread * * @ruleset_fd: File descriptor tied to the ruleset to merge with the target. * @flags: Supported values: * * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF * * This system call enables to enforce a Landlock ruleset on the current * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its * namespace or is running with no_new_privs. This avoids scenarios where * unprivileged tasks can affect the behavior of privileged children. * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - %EINVAL: @flags contains an unknown bit. * - %EBADF: @ruleset_fd is not a file descriptor for the current thread; * - %EBADFD: @ruleset_fd is not a ruleset file descriptor; * - %EPERM: @ruleset_fd has no read access to the underlying ruleset, or the * current thread is not running with no_new_privs, or it doesn't have * %CAP_SYS_ADMIN in its namespace. * - %E2BIG: The maximum number of stacked rulesets is reached for the current * thread. * * .. kernel-doc:: include/uapi/linux/landlock.h * :identifiers: landlock_restrict_self_flags */ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, flags) { struct landlock_ruleset *new_dom, *ruleset __free(landlock_put_ruleset) = NULL; struct cred *new_cred; struct landlock_cred_security *new_llcred; bool __maybe_unused log_same_exec, log_new_exec, log_subdomains, prev_log_subdomains; if (!is_initialized()) return -EOPNOTSUPP; /* * Similar checks as for seccomp(2), except that an -EPERM may be * returned. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if ((flags | LANDLOCK_MASK_RESTRICT_SELF) != LANDLOCK_MASK_RESTRICT_SELF) return -EINVAL; /* Translates "off" flag to boolean. */ log_same_exec = !(flags & LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF); /* Translates "on" flag to boolean. */ log_new_exec = !!(flags & LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON); /* Translates "off" flag to boolean. */ log_subdomains = !(flags & LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF); /* * It is allowed to set LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF with * -1 as ruleset_fd, but no other flag must be set. */ if (!(ruleset_fd == -1 && flags == LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF)) { /* Gets and checks the ruleset. */ ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_READ); if (IS_ERR(ruleset)) return PTR_ERR(ruleset); } /* Prepares new credentials. */ new_cred = prepare_creds(); if (!new_cred) return -ENOMEM; new_llcred = landlock_cred(new_cred); #ifdef CONFIG_AUDIT prev_log_subdomains = !new_llcred->log_subdomains_off; new_llcred->log_subdomains_off = !prev_log_subdomains || !log_subdomains; #endif /* CONFIG_AUDIT */ /* * The only case when a ruleset may not be set is if * LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF is set and ruleset_fd is -1. * We could optimize this case by not calling commit_creds() if this flag * was already set, but it is not worth the complexity. */ if (!ruleset) return commit_creds(new_cred); /* * There is no possible race condition while copying and manipulating * the current credentials because they are dedicated per thread. */ new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset); if (IS_ERR(new_dom)) { abort_creds(new_cred); return PTR_ERR(new_dom); } #ifdef CONFIG_AUDIT new_dom->hierarchy->log_same_exec = log_same_exec; new_dom->hierarchy->log_new_exec = log_new_exec; if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains) new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED; #endif /* CONFIG_AUDIT */ /* Replaces the old (prepared) domain. */ landlock_put_ruleset(new_llcred->domain); new_llcred->domain = new_dom; #ifdef CONFIG_AUDIT new_llcred->domain_exec |= BIT(new_dom->num_layers - 1); #endif /* CONFIG_AUDIT */ return commit_creds(new_cred); }
24 53 964 129 2 1 15 2 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/userfaultfd_k.h * * Copyright (C) 2015 Red Hat, Inc. * */ #ifndef _LINUX_USERFAULTFD_K_H #define _LINUX_USERFAULTFD_K_H #ifdef CONFIG_USERFAULTFD #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/leafops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want * to re-use O_* flags that couldn't possibly have a meaning * from userfaultfd, in order to leave a free define-space for * shared O_* flags. */ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. * * Locking order: * fd_wqh.lock * fault_pending_wqh.lock * fault_wqh.lock * event_wqh.lock * * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's * also taken in IRQ context. */ struct userfaultfd_ctx { /* waitqueue head for the pending (i.e. not read) userfaults */ wait_queue_head_t fault_pending_wqh; /* waitqueue head for the userfaults */ wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; /* waitqueue head for events */ wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ seqcount_spinlock_t refile_seq; /* pseudo fd refcounting */ refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ unsigned int features; /* released */ bool released; /* * Prevents userfaultfd operations (fill/move/wp) from happening while * some non-cooperative event(s) is taking place. Increments are done * in write-mode. Whereas, userfaultfd operations, which includes * reading mmap_changing, is done under read-mode. */ struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; /* Mutually exclusive modes of operation. */ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1)) static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected) { return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected); } static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) { flags &= ~MFILL_ATOMIC_MODE_MASK; return flags | ((__force uffd_flags_t) mode); } /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len); extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len, uffd_flags_t flags); extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags); extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } /* * Never enable huge pmd sharing on some uffd registered vmas: * * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. * * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for * VMAs which share huge pmds. (If you have two mappings to the same * underlying pages, and fault in the non-UFFD-registered one with a write, * with huge pmd sharing this would *also* setup the second UFFD-registered * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } /* * Don't do fault around for either WP or MINOR registered uffd range. For * MINOR registered range, fault around will be a total disaster and ptes can * be installed without notifications; for WP it should mostly be fine as long * as the fault around checks for pte_none() before the installation, however * to be super safe we just forbid it. */ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_WP; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MINOR; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return userfaultfd_wp(vma) && pte_uffd_wp(pte); } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & __VM_UFFD_FLAGS; } static inline bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { vm_flags &= __VM_UFFD_FLAGS; if (vma->vm_flags & VM_DROPPABLE) return false; if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; /* * If wp async enabled, and WP is the only mode enabled, allow any * memory type. */ if (wp_async && (vm_flags == VM_UFFD_WP)) return true; /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); void dup_userfaultfd_fail(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *); extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); void userfaultfd_reset_ctx(struct vm_area_struct *vma); struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end); int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async); void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { /* Only wr-protect mode uses pte markers */ if (!userfaultfd_wp(vma)) return false; /* File-based uffd-wp always need markers */ if (!vma_is_anonymous(vma)) return true; /* * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED * enabled (to apply markers on zero pages). */ return userfaultfd_wp_unpopulated(vma); } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { if (!uffd_supports_wp_marker()) return false; if (pte_present(pte)) return false; if (pte_swp_uffd_wp(pte)) return true; if (pte_is_uffd_wp_marker(pte)) return true; return false; } #else /* CONFIG_USERFAULTFD */ /* mm helpers */ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } static inline long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp) { return false; } static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return true; } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return false; } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return false; } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; } static inline int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *l) { return 0; } static inline void dup_userfaultfd_complete(struct list_head *l) { } static inline void dup_userfaultfd_fail(struct list_head *l) { } static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { } static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) { } static inline void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *ctx) { } static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf) { return 0; } static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) { return false; } static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { return false; } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { return false; } #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */
408 6 55 4371 686 272 4264 17 4247 4265 4294 4245 4247 4274 4285 48 49 1 24 49 50 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); /* * This function returns the first node (in sort order) of the tree. */ static inline struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } /* * This function returns the last node (in sort order) of the tree. */ static inline struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
12 12 11 11 1 1 1 1 1 1 12 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/types.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/tty.h> #include <linux/fcntl.h> #include <linux/uaccess.h> #include "tty.h" static int is_ignored(int sig) { return (sigismember(&current->blocked, sig) || current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); } /** * __tty_check_change - check for POSIX terminal changes * @tty: tty to check * @sig: signal to send * * If we try to write to, or set the state of, a terminal and we're * not in the foreground, send a SIGTTOU. If the signal is blocked or * ignored, go ahead and perform the operation. (POSIX 7.2) * * Locking: ctrl.lock */ int __tty_check_change(struct tty_struct *tty, int sig) { unsigned long flags; struct pid *pgrp, *tty_pgrp; int ret = 0; if (current->signal->tty != tty) return 0; rcu_read_lock(); pgrp = task_pgrp(current); spin_lock_irqsave(&tty->ctrl.lock, flags); tty_pgrp = tty->ctrl.pgrp; spin_unlock_irqrestore(&tty->ctrl.lock, flags); if (tty_pgrp && pgrp != tty_pgrp) { if (is_ignored(sig)) { if (sig == SIGTTIN) ret = -EIO; } else if (is_current_pgrp_orphaned()) ret = -EIO; else { kill_pgrp(pgrp, sig, 1); set_thread_flag(TIF_SIGPENDING); ret = -ERESTARTSYS; } } rcu_read_unlock(); if (!tty_pgrp) tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig); return ret; } int tty_check_change(struct tty_struct *tty) { return __tty_check_change(tty, SIGTTOU); } EXPORT_SYMBOL(tty_check_change); void proc_clear_tty(struct task_struct *p) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&p->sighand->siglock, flags); tty = p->signal->tty; p->signal->tty = NULL; spin_unlock_irqrestore(&p->sighand->siglock, flags); tty_kref_put(tty); } /** * __proc_set_tty - set the controlling terminal * @tty: tty structure * * Only callable by the session leader and only if it does not already have * a controlling terminal. * * Caller must hold: tty_lock() * a readlock on tasklist_lock * sighand lock */ static void __proc_set_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->ctrl.lock, flags); /* * The session and fg pgrp references will be non-NULL if * tiocsctty() is stealing the controlling tty */ put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.pgrp = get_pid(task_pgrp(current)); tty->ctrl.session = get_pid(task_session(current)); spin_unlock_irqrestore(&tty->ctrl.lock, flags); if (current->signal->tty) { tty_debug(tty, "current tty %s not NULL!!\n", current->signal->tty->name); tty_kref_put(current->signal->tty); } put_pid(current->signal->tty_old_pgrp); current->signal->tty = tty_kref_get(tty); current->signal->tty_old_pgrp = NULL; } static void proc_set_tty(struct tty_struct *tty) { spin_lock_irq(&current->sighand->siglock); __proc_set_tty(tty); spin_unlock_irq(&current->sighand->siglock); } /* * Called by tty_open() to set the controlling tty if applicable. */ void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty) { read_lock(&tasklist_lock); spin_lock_irq(&current->sighand->siglock); if (current->signal->leader && !current->signal->tty && tty->ctrl.session == NULL) { /* * Don't let a process that only has write access to the tty * obtain the privileges associated with having a tty as * controlling terminal (being able to reopen it with full * access through /dev/tty, being able to perform pushback). * Many distributions set the group of all ttys to "tty" and * grant write-only access to all terminals for setgid tty * binaries, which should not imply full privileges on all ttys. * * This could theoretically break old code that performs open() * on a write-only file descriptor. In that case, it might be * necessary to also permit this if * inode_permission(inode, MAY_READ) == 0. */ if (filp->f_mode & FMODE_READ) __proc_set_tty(tty); } spin_unlock_irq(&current->sighand->siglock); read_unlock(&tasklist_lock); } struct tty_struct *get_current_tty(void) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&current->sighand->siglock, flags); tty = tty_kref_get(current->signal->tty); spin_unlock_irqrestore(&current->sighand->siglock, flags); return tty; } EXPORT_SYMBOL_GPL(get_current_tty); /* * Called from tty_release(). */ void session_clear_tty(struct pid *session) { struct task_struct *p; do_each_pid_task(session, PIDTYPE_SID, p) { proc_clear_tty(p); } while_each_pid_task(session, PIDTYPE_SID, p); } /** * tty_signal_session_leader - sends SIGHUP to session leader * @tty: controlling tty * @exit_session: if non-zero, signal all foreground group processes * * Send SIGHUP and SIGCONT to the session leader and its process group. * Optionally, signal all processes in the foreground process group. * * Returns the number of processes in the session with this tty * as their controlling terminal. This value is used to drop * tty references for those processes. */ int tty_signal_session_leader(struct tty_struct *tty, int exit_session) { struct task_struct *p; int refs = 0; struct pid *tty_pgrp = NULL; read_lock(&tasklist_lock); if (tty->ctrl.session) { do_each_pid_task(tty->ctrl.session, PIDTYPE_SID, p) { spin_lock_irq(&p->sighand->siglock); if (p->signal->tty == tty) { p->signal->tty = NULL; /* * We defer the dereferences outside of * the tasklist lock. */ refs++; } if (!p->signal->leader) { spin_unlock_irq(&p->sighand->siglock); continue; } send_signal_locked(SIGHUP, SEND_SIG_PRIV, p, PIDTYPE_TGID); send_signal_locked(SIGCONT, SEND_SIG_PRIV, p, PIDTYPE_TGID); put_pid(p->signal->tty_old_pgrp); /* A noop */ spin_lock(&tty->ctrl.lock); tty_pgrp = get_pid(tty->ctrl.pgrp); if (tty->ctrl.pgrp) p->signal->tty_old_pgrp = get_pid(tty->ctrl.pgrp); spin_unlock(&tty->ctrl.lock); spin_unlock_irq(&p->sighand->siglock); } while_each_pid_task(tty->ctrl.session, PIDTYPE_SID, p); } read_unlock(&tasklist_lock); if (tty_pgrp) { if (exit_session) kill_pgrp(tty_pgrp, SIGHUP, exit_session); put_pid(tty_pgrp); } return refs; } /** * disassociate_ctty - disconnect controlling tty * @on_exit: true if exiting so need to "hang up" the session * * This function is typically called only by the session leader, when * it wants to disassociate itself from its controlling tty. * * It performs the following functions: * (1) Sends a SIGHUP and SIGCONT to the foreground process group * (2) Clears the tty from being controlling the session * (3) Clears the controlling tty for all processes in the * session group. * * The argument on_exit is set to 1 if called when a process is * exiting; it is 0 if called by the ioctl TIOCNOTTY. * * Locking: * BTM is taken for hysterical raisons, and held when * called from no_tty(). * tty_mutex is taken to protect tty * ->siglock is taken to protect ->signal/->sighand * tasklist_lock is taken to walk process list for sessions * ->siglock is taken to protect ->signal/->sighand */ void disassociate_ctty(int on_exit) { struct tty_struct *tty; if (!current->signal->leader) return; tty = get_current_tty(); if (tty) { if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { tty_vhangup_session(tty); } else { struct pid *tty_pgrp = tty_get_pgrp(tty); if (tty_pgrp) { kill_pgrp(tty_pgrp, SIGHUP, on_exit); if (!on_exit) kill_pgrp(tty_pgrp, SIGCONT, on_exit); put_pid(tty_pgrp); } } tty_kref_put(tty); } else if (on_exit) { struct pid *old_pgrp; spin_lock_irq(&current->sighand->siglock); old_pgrp = current->signal->tty_old_pgrp; current->signal->tty_old_pgrp = NULL; spin_unlock_irq(&current->sighand->siglock); if (old_pgrp) { kill_pgrp(old_pgrp, SIGHUP, on_exit); kill_pgrp(old_pgrp, SIGCONT, on_exit); put_pid(old_pgrp); } return; } tty = get_current_tty(); if (tty) { unsigned long flags; tty_lock(tty); spin_lock_irqsave(&tty->ctrl.lock, flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; spin_unlock_irqrestore(&tty->ctrl.lock, flags); tty_unlock(tty); tty_kref_put(tty); } /* If tty->ctrl.pgrp is not NULL, it may be assigned to * current->signal->tty_old_pgrp in a race condition, and * cause pid memleak. Release current->signal->tty_old_pgrp * after tty->ctrl.pgrp set to NULL. */ spin_lock_irq(&current->sighand->siglock); put_pid(current->signal->tty_old_pgrp); current->signal->tty_old_pgrp = NULL; spin_unlock_irq(&current->sighand->siglock); /* Now clear signal->tty under the lock */ read_lock(&tasklist_lock); session_clear_tty(task_session(current)); read_unlock(&tasklist_lock); } /* * * no_tty - Ensure the current process does not have a controlling tty */ void no_tty(void) { /* * FIXME: Review locking here. The tty_lock never covered any race * between a new association and proc_clear_tty but possibly we need * to protect against this anyway. */ struct task_struct *tsk = current; disassociate_ctty(0); proc_clear_tty(tsk); } /** * tiocsctty - set controlling tty * @tty: tty structure * @file: file structure used to check permissions * @arg: user argument * * This ioctl is used to manage job control. It permits a session * leader to set this tty as the controlling tty for the session. * * Locking: * Takes tty_lock() to serialize proc_set_tty() for this tty * Takes tasklist_lock internally to walk sessions * Takes ->siglock() when updating signal->tty */ static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) { int ret = 0; tty_lock(tty); read_lock(&tasklist_lock); if (current->signal->leader && task_session(current) == tty->ctrl.session) goto unlock; /* * The process must be a session leader and * not have a controlling tty already. */ if (!current->signal->leader || current->signal->tty) { ret = -EPERM; goto unlock; } if (tty->ctrl.session) { /* * This tty is already the controlling * tty for another session group! */ if (arg == 1 && capable(CAP_SYS_ADMIN)) { /* * Steal it away */ session_clear_tty(tty->ctrl.session); } else { ret = -EPERM; goto unlock; } } /* See the comment in tty_open_proc_set_tty(). */ if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto unlock; } proc_set_tty(tty); unlock: read_unlock(&tasklist_lock); tty_unlock(tty); return ret; } /** * tty_get_pgrp - return a ref counted pgrp pid * @tty: tty to read * * Returns a refcounted instance of the pid struct for the process * group controlling the tty. */ struct pid *tty_get_pgrp(struct tty_struct *tty) { unsigned long flags; struct pid *pgrp; spin_lock_irqsave(&tty->ctrl.lock, flags); pgrp = get_pid(tty->ctrl.pgrp); spin_unlock_irqrestore(&tty->ctrl.lock, flags); return pgrp; } EXPORT_SYMBOL_GPL(tty_get_pgrp); /* * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly * without this... * * The caller must hold rcu lock or the tasklist lock. */ static struct pid *session_of_pgrp(struct pid *pgrp) { struct task_struct *p; struct pid *sid = NULL; p = pid_task(pgrp, PIDTYPE_PGID); if (p == NULL) p = pid_task(pgrp, PIDTYPE_PID); if (p != NULL) sid = task_session(p); return sid; } /** * tiocgpgrp - get process group * @tty: tty passed by user * @real_tty: tty side of the tty passed by the user if a pty else the tty * @p: returned pid * * Obtain the process group of the tty. If there is no process group * return an error. * * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { struct pid *pid; int ret; /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; pid = tty_get_pgrp(real_tty); ret = put_user(pid_vnr(pid), p); put_pid(pid); return ret; } /** * tiocspgrp - attempt to set process group * @tty: tty passed by user * @real_tty: tty side device matching tty passed by user * @p: pid pointer * * Set the process group of the tty to the session passed. Only * permitted where the tty session is our session. * * Locking: RCU, ctrl lock */ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { struct pid *pgrp; pid_t pgrp_nr; int retval = tty_check_change(real_tty); if (retval == -EIO) return -ENOTTY; if (retval) return retval; if (get_user(pgrp_nr, p)) return -EFAULT; if (pgrp_nr < 0) return -EINVAL; spin_lock_irq(&real_tty->ctrl.lock); if (!current->signal->tty || (current->signal->tty != real_tty) || (real_tty->ctrl.session != task_session(current))) { retval = -ENOTTY; goto out_unlock_ctrl; } rcu_read_lock(); pgrp = find_vpid(pgrp_nr); retval = -ESRCH; if (!pgrp) goto out_unlock; retval = -EPERM; if (session_of_pgrp(pgrp) != task_session(current)) goto out_unlock; retval = 0; put_pid(real_tty->ctrl.pgrp); real_tty->ctrl.pgrp = get_pid(pgrp); out_unlock: rcu_read_unlock(); out_unlock_ctrl: spin_unlock_irq(&real_tty->ctrl.lock); return retval; } /** * tiocgsid - get session id * @tty: tty passed by user * @real_tty: tty side of the tty passed by the user if a pty else the tty * @p: pointer to returned session id * * Obtain the session id of the tty. If there is no session * return an error. */ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { unsigned long flags; pid_t sid; /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; spin_lock_irqsave(&real_tty->ctrl.lock, flags); if (!real_tty->ctrl.session) goto err; sid = pid_vnr(real_tty->ctrl.session); spin_unlock_irqrestore(&real_tty->ctrl.lock, flags); return put_user(sid, p); err: spin_unlock_irqrestore(&real_tty->ctrl.lock, flags); return -ENOTTY; } /* * Called from tty_ioctl(). If tty is a pty then real_tty is the slave side, * if not then tty == real_tty. */ long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, struct file *file, unsigned int cmd, unsigned long arg) { void __user *p = (void __user *)arg; switch (cmd) { case TIOCNOTTY: if (current->signal->tty != tty) return -ENOTTY; no_tty(); return 0; case TIOCSCTTY: return tiocsctty(real_tty, file, arg); case TIOCGPGRP: return tiocgpgrp(tty, real_tty, p); case TIOCSPGRP: return tiocspgrp(tty, real_tty, p); case TIOCGSID: return tiocgsid(tty, real_tty, p); } return -ENOIOCTLCMD; }
817 248 439 437 248 23 22 3 23 3 3 2 467 20 41 4 21 436 4 20 21 7 7 19 14 20 21 49 437 188 188 459 458 43 42 43 656 438 16 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either * known (0 or 1), or unknown (x). Arithmetic operations on tnums will * propagate the unknown bits such that the tnum result represents all the * possible results for possible values of the operands. */ #include <linux/kernel.h> #include <linux/tnum.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; struct tnum tnum_const(u64 value) { return TNUM(value, 0); } struct tnum tnum_range(u64 min, u64 max) { u64 chi = min ^ max, delta; u8 bits = fls64(chi); /* special case, needed because 1ULL << 64 is undefined */ if (bits > 63) return tnum_unknown; /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return * constant min (since min == max). */ delta = (1ULL << bits) - 1; return TNUM(min & ~delta, delta); } struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); } struct tnum tnum_rshift(struct tnum a, u8 shift) { return TNUM(a.value >> shift, a.mask >> shift); } struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ if (insn_bitness == 32) return TNUM((u32)(((s32)a.value) >> min_shift), (u32)(((s32)a.mask) >> min_shift)); else return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; sm = a.mask + b.mask; sv = a.value + b.value; sigma = sm + sv; chi = sigma ^ sv; mu = chi | a.mask | b.mask; return TNUM(sv & ~mu, mu); } struct tnum tnum_sub(struct tnum a, struct tnum b) { u64 dv, alpha, beta, chi, mu; dv = a.value - b.value; alpha = dv + a.mask; beta = dv - b.mask; chi = alpha ^ beta; mu = chi | a.mask | b.mask; return TNUM(dv & ~mu, mu); } struct tnum tnum_neg(struct tnum a) { return tnum_sub(TNUM(0, 0), a); } struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } struct tnum tnum_or(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask | b.mask; return TNUM(v, mu & ~v); } struct tnum tnum_xor(struct tnum a, struct tnum b) { u64 v, mu; v = a.value ^ b.value; mu = a.mask | b.mask; return TNUM(v & ~mu, mu); } /* Perform long multiplication, iterating through the bits in a using rshift: * - if LSB(a) is a known 0, keep current accumulator * - if LSB(a) is a known 1, add b to current accumulator * - if LSB(a) is unknown, take a union of the above cases. * * For example: * * acc_0: acc_1: * * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 * x1 01 11 * ------ ------ ------ * 11 11 11 * xx 00 11 * ------ ------ ------ * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ else if (a.mask & 1) { /* acc = tnum_union(acc_0, acc_1), where acc_0 and * acc_1 are partial accumulators for cases * LSB(a) = certain 0 and LSB(a) = certain 1. * acc_0 = acc + 0 * b = acc. * acc_1 = acc + 1 * b = tnum_add(acc, b). */ acc = tnum_union(acc, tnum_add(acc, b)); } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } return acc; } bool tnum_overlap(struct tnum a, struct tnum b) { u64 mu; mu = ~a.mask & ~b.mask; return (a.value & mu) == (b.value & mu); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ struct tnum tnum_intersect(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask & b.mask; return TNUM(v & ~mu, mu); } /* Returns a tnum with the uncertainty from both a and b, and in addition, new * uncertainty at any position that a and b disagree. This represents a * superset of the union of the concrete sets of both a and b. Despite the * overapproximation, it is optimal. */ struct tnum tnum_union(struct tnum a, struct tnum b) { u64 v = a.value & b.value; u64 mu = (a.value ^ b.value) | a.mask | b.mask; return TNUM(v & ~mu, mu); } struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; a.mask &= (1ULL << (size * 8)) - 1; return a; } bool tnum_is_aligned(struct tnum a, u64 size) { if (!size) return true; return !((a.value | a.mask) & (size - 1)); } bool tnum_in(struct tnum a, struct tnum b) { if (b.mask & ~a.mask) return false; b.value &= ~a.mask; return a.value == b.value; } int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; for (n = 64; n; n--) { if (n < size) { if (a.mask & 1) str[n - 1] = 'x'; else if (a.value & 1) str[n - 1] = '1'; else str[n - 1] = '0'; } a.mask >>= 1; a.value >>= 1; } str[min(size - 1, (size_t)64)] = 0; return 64; } struct tnum tnum_subreg(struct tnum a) { return tnum_cast(a, 4); } struct tnum tnum_clear_subreg(struct tnum a) { return tnum_lshift(tnum_rshift(a, 32), 32); } struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg) { return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg)); } struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); }
2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 // SPDX-License-Identifier: GPL-2.0 or MIT /* * Copyright 2018 Noralf Trønnes */ #include <linux/export.h> #include <linux/iosys-map.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <drm/drm_client.h> #include <drm/drm_client_event.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_mode.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: overview * * This library provides support for clients running in the kernel like fbdev and bootsplash. * * GEM drivers which provide a GEM based dumb buffer with a virtual address are supported. */ static int drm_client_open(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_file *file; file = drm_file_alloc(dev->primary); if (IS_ERR(file)) return PTR_ERR(file); mutex_lock(&dev->filelist_mutex); list_add(&file->lhead, &dev->filelist_internal); mutex_unlock(&dev->filelist_mutex); client->file = file; return 0; } static void drm_client_close(struct drm_client_dev *client) { struct drm_device *dev = client->dev; mutex_lock(&dev->filelist_mutex); list_del(&client->file->lhead); mutex_unlock(&dev->filelist_mutex); drm_file_free(client->file); } /** * drm_client_init - Initialise a DRM client * @dev: DRM device * @client: DRM client * @name: Client name * @funcs: DRM client functions (optional) * * This initialises the client and opens a &drm_file. * Use drm_client_register() to complete the process. * The caller needs to hold a reference on @dev before calling this function. * The client is freed when the &drm_device is unregistered. See drm_client_release(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_init(struct drm_device *dev, struct drm_client_dev *client, const char *name, const struct drm_client_funcs *funcs) { int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET) || !dev->driver->dumb_create) return -EOPNOTSUPP; client->dev = dev; client->name = name; client->funcs = funcs; ret = drm_client_modeset_create(client); if (ret) return ret; ret = drm_client_open(client); if (ret) goto err_free; drm_dev_get(dev); return 0; err_free: drm_client_modeset_free(client); return ret; } EXPORT_SYMBOL(drm_client_init); /** * drm_client_register - Register client * @client: DRM client * * Add the client to the &drm_device client list to activate its callbacks. * @client must be initialized by a call to drm_client_init(). After * drm_client_register() it is no longer permissible to call drm_client_release() * directly (outside the unregister callback), instead cleanup will happen * automatically on driver unload. * * Registering a client generates a hotplug event that allows the client * to set up its display from pre-existing outputs. The client must have * initialized its state to able to handle the hotplug event successfully. */ void drm_client_register(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&dev->clientlist_mutex); list_add(&client->list, &dev->clientlist); if (client->funcs && client->funcs->hotplug) { /* * Perform an initial hotplug event to pick up the * display configuration for the client. This step * has to be performed *after* registering the client * in the list of clients, or a concurrent hotplug * event might be lost; leaving the display off. * * Hold the clientlist_mutex as for a regular hotplug * event. */ ret = client->funcs->hotplug(client); if (ret) drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_register); /** * drm_client_release - Release DRM client resources * @client: DRM client * * Releases resources by closing the &drm_file that was opened by drm_client_init(). * It is called automatically if the &drm_client_funcs.unregister callback is _not_ set. * * This function should only be called from the unregister callback. An exception * is fbdev which cannot free the buffer if userspace has open file descriptors. * * Note: * Clients cannot initiate a release by themselves. This is done to keep the code simple. * The driver has to be unloaded before the client can be unloaded. */ void drm_client_release(struct drm_client_dev *client) { struct drm_device *dev = client->dev; drm_dbg_kms(dev, "%s\n", client->name); drm_client_modeset_free(client); drm_client_close(client); if (client->funcs && client->funcs->free) client->funcs->free(client); drm_dev_put(dev); } EXPORT_SYMBOL(drm_client_release); /** * drm_client_buffer_delete - Delete a client buffer * @buffer: DRM client buffer */ void drm_client_buffer_delete(struct drm_client_buffer *buffer) { struct drm_gem_object *gem; int ret; if (!buffer) return; gem = buffer->fb->obj[0]; drm_gem_vunmap(gem, &buffer->map); ret = drm_mode_rmfb(buffer->client->dev, buffer->fb->base.id, buffer->client->file); if (ret) drm_err(buffer->client->dev, "Error removing FB:%u (%d)\n", buffer->fb->base.id, ret); drm_gem_object_put(buffer->gem); kfree(buffer); } EXPORT_SYMBOL(drm_client_buffer_delete); static struct drm_client_buffer * drm_client_buffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format, u32 handle, u32 pitch) { struct drm_mode_fb_cmd2 fb_req = { .width = width, .height = height, .pixel_format = format, .handles = { handle, }, .pitches = { pitch, }, }; struct drm_device *dev = client->dev; struct drm_client_buffer *buffer; struct drm_gem_object *obj; struct drm_framebuffer *fb; int ret; buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); buffer->client = client; obj = drm_gem_object_lookup(client->file, handle); if (!obj) { ret = -ENOENT; goto err_delete; } ret = drm_mode_addfb2(dev, &fb_req, client->file); if (ret) goto err_drm_gem_object_put; fb = drm_framebuffer_lookup(dev, client->file, fb_req.fb_id); if (drm_WARN_ON(dev, !fb)) { ret = -ENOENT; goto err_drm_mode_rmfb; } /* drop the reference we picked up in framebuffer lookup */ drm_framebuffer_put(fb); strscpy(fb->comm, client->name, TASK_COMM_LEN); buffer->gem = obj; buffer->fb = fb; return buffer; err_drm_mode_rmfb: drm_mode_rmfb(dev, fb_req.fb_id, client->file); err_drm_gem_object_put: drm_gem_object_put(obj); err_delete: kfree(buffer); return ERR_PTR(ret); } /** * drm_client_buffer_vmap_local - Map DRM client buffer into address space * @buffer: DRM client buffer * @map_copy: Returns the mapped memory's address * * This function maps a client buffer into kernel address space. If the * buffer is already mapped, it returns the existing mapping's address. * * Client buffer mappings are not ref'counted. Each call to * drm_client_buffer_vmap_local() should be closely followed by a call to * drm_client_buffer_vunmap_local(). See drm_client_buffer_vmap() for * long-term mappings. * * The returned address is a copy of the internal value. In contrast to * other vmap interfaces, you don't need it for the client's vunmap * function. So you can modify it at will during blit and draw operations. * * Returns: * 0 on success, or a negative errno code otherwise. */ int drm_client_buffer_vmap_local(struct drm_client_buffer *buffer, struct iosys_map *map_copy) { struct drm_gem_object *gem = buffer->fb->obj[0]; struct iosys_map *map = &buffer->map; int ret; drm_gem_lock(gem); ret = drm_gem_vmap_locked(gem, map); if (ret) goto err_drm_gem_vmap_unlocked; *map_copy = *map; return 0; err_drm_gem_vmap_unlocked: drm_gem_unlock(gem); return ret; } EXPORT_SYMBOL(drm_client_buffer_vmap_local); /** * drm_client_buffer_vunmap_local - Unmap DRM client buffer * @buffer: DRM client buffer * * This function removes a client buffer's memory mapping established * with drm_client_buffer_vunmap_local(). Calling this function is only * required by clients that manage their buffer mappings by themselves. */ void drm_client_buffer_vunmap_local(struct drm_client_buffer *buffer) { struct drm_gem_object *gem = buffer->fb->obj[0]; struct iosys_map *map = &buffer->map; drm_gem_vunmap_locked(gem, map); drm_gem_unlock(gem); } EXPORT_SYMBOL(drm_client_buffer_vunmap_local); /** * drm_client_buffer_vmap - Map DRM client buffer into address space * @buffer: DRM client buffer * @map_copy: Returns the mapped memory's address * * This function maps a client buffer into kernel address space. If the * buffer is already mapped, it returns the existing mapping's address. * * Client buffer mappings are not ref'counted. Each call to * drm_client_buffer_vmap() should be followed by a call to * drm_client_buffer_vunmap(); or the client buffer should be mapped * throughout its lifetime. * * The returned address is a copy of the internal value. In contrast to * other vmap interfaces, you don't need it for the client's vunmap * function. So you can modify it at will during blit and draw operations. * * Returns: * 0 on success, or a negative errno code otherwise. */ int drm_client_buffer_vmap(struct drm_client_buffer *buffer, struct iosys_map *map_copy) { struct drm_gem_object *gem = buffer->fb->obj[0]; int ret; ret = drm_gem_vmap(gem, &buffer->map); if (ret) return ret; *map_copy = buffer->map; return 0; } EXPORT_SYMBOL(drm_client_buffer_vmap); /** * drm_client_buffer_vunmap - Unmap DRM client buffer * @buffer: DRM client buffer * * This function removes a client buffer's memory mapping. Calling this * function is only required by clients that manage their buffer mappings * by themselves. */ void drm_client_buffer_vunmap(struct drm_client_buffer *buffer) { struct drm_gem_object *gem = buffer->fb->obj[0]; drm_gem_vunmap(gem, &buffer->map); } EXPORT_SYMBOL(drm_client_buffer_vunmap); /** * drm_client_buffer_create_dumb - Create a client buffer backed by a dumb buffer * @client: DRM client * @width: Framebuffer width * @height: Framebuffer height * @format: Buffer format * * This function creates a &drm_client_buffer which consists of a * &drm_framebuffer backed by a dumb buffer. * Call drm_client_buffer_delete() to free the buffer. * * Returns: * Pointer to a client buffer or an error pointer on failure. */ struct drm_client_buffer * drm_client_buffer_create_dumb(struct drm_client_dev *client, u32 width, u32 height, u32 format) { const struct drm_format_info *info = drm_format_info(format); struct drm_device *dev = client->dev; struct drm_mode_create_dumb dumb_args = { }; struct drm_client_buffer *buffer; int ret; dumb_args.width = width; dumb_args.height = height; dumb_args.bpp = drm_format_info_bpp(info, 0); ret = drm_mode_create_dumb(dev, &dumb_args, client->file); if (ret) return ERR_PTR(ret); buffer = drm_client_buffer_create(client, width, height, format, dumb_args.handle, dumb_args.pitch); if (IS_ERR(buffer)) { ret = PTR_ERR(buffer); goto err_drm_mode_destroy_dumb; } /* * The handle is only needed for creating the framebuffer, destroy it * again to solve a circular dependency should anybody export the GEM * object as DMA-buf. The framebuffer and our buffer structure are still * holding references to the GEM object to prevent its destruction. */ drm_mode_destroy_dumb(client->dev, dumb_args.handle, client->file); return buffer; err_drm_mode_destroy_dumb: drm_mode_destroy_dumb(client->dev, dumb_args.handle, client->file); return ERR_PTR(ret); } EXPORT_SYMBOL(drm_client_buffer_create_dumb); /** * drm_client_buffer_flush - Manually flush client buffer * @buffer: DRM client buffer * @rect: Damage rectangle (if NULL flushes all) * * This calls &drm_framebuffer_funcs->dirty (if present) to flush buffer changes * for drivers that need it. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_buffer_flush(struct drm_client_buffer *buffer, struct drm_rect *rect) { if (!buffer || !buffer->fb || !buffer->fb->funcs->dirty) return 0; if (rect) { struct drm_clip_rect clip = { .x1 = rect->x1, .y1 = rect->y1, .x2 = rect->x2, .y2 = rect->y2, }; return buffer->fb->funcs->dirty(buffer->fb, buffer->client->file, 0, 0, &clip, 1); } return buffer->fb->funcs->dirty(buffer->fb, buffer->client->file, 0, 0, NULL, 0); } EXPORT_SYMBOL(drm_client_buffer_flush);
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Count leading and trailing zeros functions * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_BITOPS_COUNT_ZEROS_H_ #define _LINUX_BITOPS_COUNT_ZEROS_H_ #include <asm/bitops.h> /** * count_leading_zeros - Count the number of zeros from the MSB back * @x: The value * * Count the number of leading zeros from the MSB going towards the LSB in @x. * * If the MSB of @x is set, the result is 0. * If only the LSB of @x is set, then the result is BITS_PER_LONG-1. * If @x is 0 then the result is COUNT_LEADING_ZEROS_0. */ static inline int count_leading_zeros(unsigned long x) { if (sizeof(x) == 4) return BITS_PER_LONG - fls(x); else return BITS_PER_LONG - fls64(x); } #define COUNT_LEADING_ZEROS_0 BITS_PER_LONG /** * count_trailing_zeros - Count the number of zeros from the LSB forwards * @x: The value * * Count the number of trailing zeros from the LSB going towards the MSB in @x. * * If the LSB of @x is set, the result is 0. * If only the MSB of @x is set, then the result is BITS_PER_LONG-1. * If @x is 0 then the result is COUNT_TRAILING_ZEROS_0. */ static inline int count_trailing_zeros(unsigned long x) { #define COUNT_TRAILING_ZEROS_0 (-1) if (sizeof(x) == 4) return ffs(x); else return (x != 0) ? __ffs(x) : COUNT_TRAILING_ZEROS_0; } #endif /* _LINUX_BITOPS_COUNT_ZEROS_H_ */
15 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_PLANE_H__ #define __DRM_PLANE_H__ #include <linux/list.h> #include <linux/ctype.h> #include <linux/kmsg_dump.h> #include <drm/drm_mode_object.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_rect.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_util.h> struct drm_crtc; struct drm_plane_size_hint; struct drm_printer; struct drm_modeset_acquire_ctx; enum drm_scaling_filter { DRM_SCALING_FILTER_DEFAULT, DRM_SCALING_FILTER_NEAREST_NEIGHBOR, }; /** * struct drm_plane_state - mutable plane state * * Please note that the destination coordinates @crtc_x, @crtc_y, @crtc_h and * @crtc_w and the source coordinates @src_x, @src_y, @src_h and @src_w are the * raw coordinates provided by userspace. Drivers should use * drm_atomic_helper_check_plane_state() and only use the derived rectangles in * @src and @dst to program the hardware. */ struct drm_plane_state { /** @plane: backpointer to the plane */ struct drm_plane *plane; /** * @crtc: * * Currently bound CRTC, NULL if disabled. Do not write this directly, * use drm_atomic_set_crtc_for_plane() */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer. Do not write this directly, use * drm_atomic_set_fb_for_plane() */ struct drm_framebuffer *fb; /** * @fence: * * Optional fence to wait for before scanning out @fb. The core atomic * code will set this when userspace is using explicit fencing. Do not * write this field directly for a driver's implicit fence. * * Drivers should store any implicit fence in this from their * &drm_plane_helper_funcs.prepare_fb callback. See * drm_gem_plane_helper_prepare_fb() for a suitable helper. */ struct dma_fence *fence; /** * @crtc_x: * * Left position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_x; /** * @crtc_y: * * Upper position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_y; /** @crtc_w: width of visible portion of plane on crtc */ /** @crtc_h: height of visible portion of plane on crtc */ uint32_t crtc_w, crtc_h; /** * @src_x: left position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_x; /** * @src_y: upper position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_y; /** @src_w: width of visible portion of plane (in 16.16) */ /** @src_h: height of visible portion of plane (in 16.16) */ uint32_t src_h, src_w; /** @hotspot_x: x offset to mouse cursor hotspot */ /** @hotspot_y: y offset to mouse cursor hotspot */ int32_t hotspot_x, hotspot_y; /** * @alpha: * Opacity of the plane with 0 as completely transparent and 0xffff as * completely opaque. See drm_plane_create_alpha_property() for more * details. */ u16 alpha; /** * @pixel_blend_mode: * The alpha blending equation selection, describing how the pixels from * the current plane are composited with the background. Value can be * one of DRM_MODE_BLEND_* */ uint16_t pixel_blend_mode; /** * @rotation: * Rotation of the plane. See drm_plane_create_rotation_property() for * more details. */ unsigned int rotation; /** * @zpos: * Priority of the given plane on crtc (optional). * * User-space may set mutable zpos properties so that multiple active * planes on the same CRTC have identical zpos values. This is a * user-space bug, but drivers can solve the conflict by comparing the * plane object IDs; the plane with a higher ID is stacked on top of a * plane with a lower ID. * * See drm_plane_create_zpos_property() and * drm_plane_create_zpos_immutable_property() for more details. */ unsigned int zpos; /** * @normalized_zpos: * Normalized value of zpos: unique, range from 0 to N-1 where N is the * number of active planes for given crtc. Note that the driver must set * &drm_mode_config.normalize_zpos or call drm_atomic_normalize_zpos() to * update this before it can be trusted. */ unsigned int normalized_zpos; /** * @color_encoding: * * Color encoding for non RGB formats */ enum drm_color_encoding color_encoding; /** * @color_range: * * Color range for non RGB formats */ enum drm_color_range color_range; /** * @fb_damage_clips: * * Blob representing damage (area in plane framebuffer that changed * since last plane update) as an array of &drm_mode_rect in framebuffer * coodinates of the attached framebuffer. Note that unlike plane src, * damage clips are not in 16.16 fixed point. * * See drm_plane_get_damage_clips() and * drm_plane_get_damage_clips_count() for accessing these. */ struct drm_property_blob *fb_damage_clips; /** * @ignore_damage_clips: * * Set by drivers to indicate the drm_atomic_helper_damage_iter_init() * helper that the @fb_damage_clips blob property should be ignored. * * See :ref:`damage_tracking_properties` for more information. */ bool ignore_damage_clips; /** * @src: * * source coordinates of the plane (in 16.16). * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ /** * @dst: * * clipped destination coordinates of the plane. * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ struct drm_rect src, dst; /** * @visible: * * Visibility of the plane. This can be false even if fb!=NULL and * crtc!=NULL, due to clipping. */ bool visible; /** * @scaling_filter: * * Scaling filter to be applied */ enum drm_scaling_filter scaling_filter; /** * @color_pipeline: * * The first colorop of the active color pipeline, or NULL, if no * color pipeline is active. */ struct drm_colorop *color_pipeline; /** * @commit: Tracks the pending commit to prevent use-after-free conditions, * and for async plane updates. * * May be NULL. */ struct drm_crtc_commit *commit; /** @state: backpointer to global drm_atomic_state */ struct drm_atomic_state *state; /** * @color_mgmt_changed: Color management properties have changed. Used * by the atomic helpers and drivers to steer the atomic commit control * flow. */ bool color_mgmt_changed : 1; }; static inline struct drm_rect drm_plane_state_src(const struct drm_plane_state *state) { struct drm_rect src = { .x1 = state->src_x, .y1 = state->src_y, .x2 = state->src_x + state->src_w, .y2 = state->src_y + state->src_h, }; return src; } static inline struct drm_rect drm_plane_state_dest(const struct drm_plane_state *state) { struct drm_rect dest = { .x1 = state->crtc_x, .y1 = state->crtc_y, .x2 = state->crtc_x + state->crtc_w, .y2 = state->crtc_y + state->crtc_h, }; return dest; } /** * struct drm_plane_funcs - driver plane control functions */ struct drm_plane_funcs { /** * @update_plane: * * This is the legacy entry point to enable and configure the plane for * the given CRTC and framebuffer. It is never called to disable the * plane, i.e. the passed-in crtc and fb paramters are never NULL. * * The source rectangle in frame buffer memory coordinates is given by * the src_x, src_y, src_w and src_h parameters (as 16.16 fixed point * values). Devices that don't support subpixel plane coordinates can * ignore the fractional part. * * The destination rectangle in CRTC coordinates is given by the * crtc_x, crtc_y, crtc_w and crtc_h parameters (as integer values). * Devices scale the source rectangle to the destination rectangle. If * scaling is not supported, and the source rectangle size doesn't match * the destination rectangle size, the driver must return a * -<errorname>EINVAL</errorname> error. * * Drivers implementing atomic modeset should use * drm_atomic_helper_update_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*update_plane)(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, struct drm_modeset_acquire_ctx *ctx); /** * @disable_plane: * * This is the legacy entry point to disable the plane. The DRM core * calls this method in response to a DRM_IOCTL_MODE_SETPLANE IOCTL call * with the frame buffer ID set to 0. Disabled planes must not be * processed by the CRTC. * * Drivers implementing atomic modeset should use * drm_atomic_helper_disable_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*disable_plane)(struct drm_plane *plane, struct drm_modeset_acquire_ctx *ctx); /** * @destroy: * * Clean up plane resources. This is only called at driver unload time * through drm_mode_config_cleanup() since a plane cannot be hotplugged * in DRM. */ void (*destroy)(struct drm_plane *plane); /** * @reset: * * Reset plane hardware and software state to off. This function isn't * called by the core directly, only through drm_mode_config_reset(). * It's not a helper hook only for historical reasons. * * Atomic drivers can use drm_atomic_helper_plane_reset() to reset * atomic state using this hook. */ void (*reset)(struct drm_plane *plane); /** * @set_property: * * This is the legacy entry point to update a property attached to the * plane. * * This callback is optional if the driver does not support any legacy * driver-private properties. For atomic drivers it is not used because * property handling is done entirely in the DRM core. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*set_property)(struct drm_plane *plane, struct drm_property *property, uint64_t val); /** * @atomic_duplicate_state: * * Duplicate the current atomic state for this plane and return it. * The core and helpers guarantee that any atomic state duplicated with * this hook and still owned by the caller (i.e. not transferred to the * driver by calling &drm_mode_config_funcs.atomic_commit) will be * cleaned up by calling the @atomic_destroy_state hook in this * structure. * * This callback is mandatory for atomic drivers. * * Atomic drivers which don't subclass &struct drm_plane_state should use * drm_atomic_helper_plane_duplicate_state(). Drivers that subclass the * state structure to extend it with driver-private state should use * __drm_atomic_helper_plane_duplicate_state() to make sure shared state is * duplicated in a consistent fashion across drivers. * * It is an error to call this hook before &drm_plane.state has been * initialized correctly. * * NOTE: * * If the duplicate state references refcounted resources this hook must * acquire a reference for each of them. The driver must release these * references again in @atomic_destroy_state. * * RETURNS: * * Duplicated atomic state or NULL when the allocation failed. */ struct drm_plane_state *(*atomic_duplicate_state)(struct drm_plane *plane); /** * @atomic_destroy_state: * * Destroy a state duplicated with @atomic_duplicate_state and release * or unreference all resources it references * * This callback is mandatory for atomic drivers. */ void (*atomic_destroy_state)(struct drm_plane *plane, struct drm_plane_state *state); /** * @atomic_set_property: * * Decode a driver-private property value and store the decoded value * into the passed-in state structure. Since the atomic core decodes all * standardized properties (even for extensions beyond the core set of * properties which might not be implemented by all drivers) this * requires drivers to subclass the state structure. * * Such driver-private properties should really only be implemented for * truly hardware/vendor specific state. Instead it is preferred to * standardize atomic extension and decode the properties used to expose * such an extension in the core. * * Do not call this function directly, use * drm_atomic_plane_set_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * NOTE: * * This function is called in the state assembly phase of atomic * modesets, which can be aborted for any reason (including on * userspace's request to just check whether a configuration would be * possible). Drivers MUST NOT touch any persistent state (hardware or * software) or data structures except the passed in @state parameter. * * Also since userspace controls in which order properties are set this * function must not do any input validation (since the state update is * incomplete and hence likely inconsistent). Instead any such input * validation must be done in the various atomic_check callbacks. * * RETURNS: * * 0 if the property has been found, -EINVAL if the property isn't * implemented by the driver (which shouldn't ever happen, the core only * asks for properties attached to this plane). No other validation is * allowed by the driver. The core already checks that the property * value is within the range (integer, valid enum value, ...) the driver * set when registering the property. */ int (*atomic_set_property)(struct drm_plane *plane, struct drm_plane_state *state, struct drm_property *property, uint64_t val); /** * @atomic_get_property: * * Reads out the decoded driver-private property. This is used to * implement the GETPLANE IOCTL. * * Do not call this function directly, use * drm_atomic_plane_get_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * RETURNS: * * 0 on success, -EINVAL if the property isn't implemented by the * driver (which should never happen, the core only asks for * properties attached to this plane). */ int (*atomic_get_property)(struct drm_plane *plane, const struct drm_plane_state *state, struct drm_property *property, uint64_t *val); /** * @late_register: * * This optional hook can be used to register additional userspace * interfaces attached to the plane like debugfs interfaces. * It is called late in the driver load sequence from drm_dev_register(). * Everything added from this callback should be unregistered in * the early_unregister callback. * * Returns: * * 0 on success, or a negative error code on failure. */ int (*late_register)(struct drm_plane *plane); /** * @early_unregister: * * This optional hook should be used to unregister the additional * userspace interfaces attached to the plane from * @late_register. It is called from drm_dev_unregister(), * early in the driver unload sequence to disable userspace access * before data structures are torndown. */ void (*early_unregister)(struct drm_plane *plane); /** * @atomic_print_state: * * If driver subclasses &struct drm_plane_state, it should implement * this optional hook for printing additional driver specific state. * * Do not call this directly, use drm_atomic_plane_print_state() * instead. */ void (*atomic_print_state)(struct drm_printer *p, const struct drm_plane_state *state); /** * @format_mod_supported: * * This optional hook is used for the DRM to determine if the given * format/modifier combination is valid for the plane. This allows the * DRM to generate the correct format bitmask (which formats apply to * which modifier), and to validate modifiers at atomic_check time. * * If not present, then any modifier in the plane's modifier * list is allowed with any of the plane's formats. * * Returns: * * True if the given modifier is valid for that format on the plane. * False otherwise. */ bool (*format_mod_supported)(struct drm_plane *plane, uint32_t format, uint64_t modifier); /** * @format_mod_supported_async: * * This optional hook is used for the DRM to determine if for * asynchronous flip the given format/modifier combination is valid for * the plane. This allows the DRM to generate the correct format * bitmask (which formats apply to which modifier), and to validate * modifiers at atomic_check time. * * Returns: * * True if the given modifier is valid for that format on the plane. * False otherwise. */ bool (*format_mod_supported_async)(struct drm_plane *plane, u32 format, u64 modifier); }; /** * enum drm_plane_type - uapi plane type enumeration * * For historical reasons not all planes are made the same. This enumeration is * used to tell the different types of planes apart to implement the different * uapi semantics for them. For userspace which is universal plane aware and * which is using that atomic IOCTL there's no difference between these planes * (beyong what the driver and hardware can support of course). * * For compatibility with legacy userspace, only overlay planes are made * available to userspace by default. Userspace clients may set the * &DRM_CLIENT_CAP_UNIVERSAL_PLANES client capability bit to indicate that they * wish to receive a universal plane list containing all plane types. See also * drm_for_each_legacy_plane(). * * In addition to setting each plane's type, drivers need to setup the * &drm_crtc.primary and optionally &drm_crtc.cursor pointers for legacy * IOCTLs. See drm_crtc_init_with_planes(). * * WARNING: The values of this enum is UABI since they're exposed in the "type" * property. */ enum drm_plane_type { /** * @DRM_PLANE_TYPE_OVERLAY: * * Overlay planes represent all non-primary, non-cursor planes. Some * drivers refer to these types of planes as "sprites" internally. */ DRM_PLANE_TYPE_OVERLAY, /** * @DRM_PLANE_TYPE_PRIMARY: * * A primary plane attached to a CRTC is the most likely to be able to * light up the CRTC when no scaling/cropping is used and the plane * covers the whole CRTC. */ DRM_PLANE_TYPE_PRIMARY, /** * @DRM_PLANE_TYPE_CURSOR: * * A cursor plane attached to a CRTC is more likely to be able to be * enabled when no scaling/cropping is used and the framebuffer has the * size indicated by &drm_mode_config.cursor_width and * &drm_mode_config.cursor_height. Additionally, if the driver doesn't * support modifiers, the framebuffer should have a linear layout. */ DRM_PLANE_TYPE_CURSOR, }; /** * struct drm_plane - central DRM plane control structure * * Planes represent the scanout hardware of a display block. They receive their * input data from a &drm_framebuffer and feed it to a &drm_crtc. Planes control * the color conversion, see `Plane Composition Properties`_ for more details, * and are also involved in the color conversion of input pixels, see `Color * Management Properties`_ for details on that. */ struct drm_plane { /** @dev: DRM device this plane belongs to */ struct drm_device *dev; /** * @head: * * List of all planes on @dev, linked from &drm_mode_config.plane_list. * Invariant over the lifetime of @dev and therefore does not need * locking. */ struct list_head head; /** @name: human readable name, can be overwritten by the driver */ char *name; /** * @mutex: * * Protects modeset plane state, together with the &drm_crtc.mutex of * CRTC this plane is linked to (when active, getting activated or * getting disabled). * * For atomic drivers specifically this protects @state. */ struct drm_modeset_lock mutex; /** @base: base mode object */ struct drm_mode_object base; /** * @possible_crtcs: pipes this plane can be bound to constructed from * drm_crtc_mask() */ uint32_t possible_crtcs; /** @format_types: array of formats supported by this plane */ uint32_t *format_types; /** @format_count: Size of the array pointed at by @format_types. */ unsigned int format_count; /** * @format_default: driver hasn't supplied supported formats for the * plane. Used by the non-atomic driver compatibility wrapper only. */ bool format_default; /** @modifiers: array of modifiers supported by this plane */ uint64_t *modifiers; /** @modifier_count: Size of the array pointed at by @modifier_count. */ unsigned int modifier_count; /** * @crtc: * * Currently bound CRTC, only meaningful for non-atomic drivers. For * atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.crtc. */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer, only meaningful for non-atomic drivers. * For atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.fb. */ struct drm_framebuffer *fb; /** * @old_fb: * * Temporary tracking of the old fb while a modeset is ongoing. Only * used by non-atomic drivers, forced to be NULL for atomic drivers. */ struct drm_framebuffer *old_fb; /** @funcs: plane control functions */ const struct drm_plane_funcs *funcs; /** @properties: property tracking for this plane */ struct drm_object_properties properties; /** @type: Type of plane, see &enum drm_plane_type for details. */ enum drm_plane_type type; /** * @index: Position inside the mode_config.list, can be used as an array * index. It is invariant over the lifetime of the plane. */ unsigned index; /** @helper_private: mid-layer private data */ const struct drm_plane_helper_funcs *helper_private; /** * @state: * * Current atomic state for this plane. * * This is protected by @mutex. Note that nonblocking atomic commits * access the current plane state without taking locks. Either by going * through the &struct drm_atomic_state pointers, see * for_each_oldnew_plane_in_state(), for_each_old_plane_in_state() and * for_each_new_plane_in_state(). Or through careful ordering of atomic * commit operations as implemented in the atomic helpers, see * &struct drm_crtc_commit. */ struct drm_plane_state *state; /** * @alpha_property: * Optional alpha property for this plane. See * drm_plane_create_alpha_property(). */ struct drm_property *alpha_property; /** * @zpos_property: * Optional zpos property for this plane. See * drm_plane_create_zpos_property(). */ struct drm_property *zpos_property; /** * @rotation_property: * Optional rotation property for this plane. See * drm_plane_create_rotation_property(). */ struct drm_property *rotation_property; /** * @blend_mode_property: * Optional "pixel blend mode" enum property for this plane. * Blend mode property represents the alpha blending equation selection, * describing how the pixels from the current plane are composited with * the background. */ struct drm_property *blend_mode_property; /** * @color_encoding_property: * * Optional "COLOR_ENCODING" enum property for specifying * color encoding for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_encoding_property; /** * @color_range_property: * * Optional "COLOR_RANGE" enum property for specifying * color range for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_range_property; /** * @color_pipeline_property: * * Optional "COLOR_PIPELINE" enum property for specifying * a color pipeline to use on the plane. */ struct drm_property *color_pipeline_property; /** * @scaling_filter_property: property to apply a particular filter while * scaling. */ struct drm_property *scaling_filter_property; /** * @hotspot_x_property: property to set mouse hotspot x offset. */ struct drm_property *hotspot_x_property; /** * @hotspot_y_property: property to set mouse hotspot y offset. */ struct drm_property *hotspot_y_property; /** * @kmsg_panic: Used to register a panic notifier for this plane */ struct kmsg_dumper kmsg_panic; }; #define obj_to_plane(x) container_of(x, struct drm_plane, base) __printf(9, 10) int drm_universal_plane_init(struct drm_device *dev, struct drm_plane *plane, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type type, const char *name, ...); void drm_plane_cleanup(struct drm_plane *plane); __printf(10, 11) void *__drmm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drmm_universal_plane_alloc - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. Cleanup is * automatically handled through registering drm_plane_cleanup() with * drmm_add_action(). * * The @drm_plane_funcs.destroy hook must be NULL. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drmm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drmm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) __printf(10, 11) void *__drm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drm_universal_plane_alloc() - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. The caller * is responsible for releasing the allocated memory with kfree(). * * Drivers are encouraged to use drmm_universal_plane_alloc() instead. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) /** * drm_plane_index - find the index of a registered plane * @plane: plane to find index for * * Given a registered plane, return the index of that plane within a DRM * device's list of planes. */ static inline unsigned int drm_plane_index(const struct drm_plane *plane) { return plane->index; } /** * drm_plane_mask - find the mask of a registered plane * @plane: plane to find mask for */ static inline u32 drm_plane_mask(const struct drm_plane *plane) { return 1 << drm_plane_index(plane); } struct drm_plane * drm_plane_from_index(struct drm_device *dev, int idx); void drm_plane_force_disable(struct drm_plane *plane); int drm_mode_plane_set_obj_prop(struct drm_plane *plane, struct drm_property *property, uint64_t value); /** * drm_plane_find - find a &drm_plane * @dev: DRM device * @file_priv: drm file to check for lease against. * @id: plane id * * Returns the plane with @id, NULL if it doesn't exist. Simple wrapper around * drm_mode_object_find(). */ static inline struct drm_plane *drm_plane_find(struct drm_device *dev, struct drm_file *file_priv, uint32_t id) { struct drm_mode_object *mo; mo = drm_mode_object_find(dev, file_priv, id, DRM_MODE_OBJECT_PLANE); return mo ? obj_to_plane(mo) : NULL; } /** * drm_for_each_plane_mask - iterate over planes specified by bitmask * @plane: the loop cursor * @dev: the DRM device * @plane_mask: bitmask of plane indices * * Iterate over all planes specified by bitmask. */ #define drm_for_each_plane_mask(plane, dev, plane_mask) \ list_for_each_entry((plane), &(dev)->mode_config.plane_list, head) \ for_each_if ((plane_mask) & drm_plane_mask(plane)) /** * drm_for_each_legacy_plane - iterate over all planes for legacy userspace * @plane: the loop cursor * @dev: the DRM device * * Iterate over all legacy planes of @dev, excluding primary and cursor planes. * This is useful for implementing userspace apis when userspace is not * universal plane aware. See also &enum drm_plane_type. */ #define drm_for_each_legacy_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) \ for_each_if (plane->type == DRM_PLANE_TYPE_OVERLAY) /** * drm_for_each_plane - iterate over all planes * @plane: the loop cursor * @dev: the DRM device * * Iterate over all planes of @dev, include primary and cursor planes. */ #define drm_for_each_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) bool drm_plane_has_format(struct drm_plane *plane, u32 format, u64 modifier); bool drm_any_plane_has_format(struct drm_device *dev, u32 format, u64 modifier); void drm_plane_enable_fb_damage_clips(struct drm_plane *plane); unsigned int drm_plane_get_damage_clips_count(const struct drm_plane_state *state); struct drm_mode_rect * drm_plane_get_damage_clips(const struct drm_plane_state *state); int drm_plane_create_scaling_filter_property(struct drm_plane *plane, unsigned int supported_filters); int drm_plane_add_size_hints_property(struct drm_plane *plane, const struct drm_plane_size_hint *hints, int num_hints); int drm_plane_create_color_pipeline_property(struct drm_plane *plane, const struct drm_prop_enum_list *pipelines, int num_pipelines); #endif
2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 // SPDX-License-Identifier: GPL-2.0-or-later /* * acpi_bus.c - ACPI Bus Driver ($Revision: 80 $) * * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/sched.h> #include <linux/pm.h> #include <linux/device.h> #include <linux/proc_fs.h> #include <linux/acpi.h> #include <linux/slab.h> #include <linux/regulator/machine.h> #include <linux/workqueue.h> #include <linux/reboot.h> #include <linux/delay.h> #ifdef CONFIG_X86 #include <asm/mpspec.h> #include <linux/dmi.h> #endif #include <linux/acpi_viot.h> #include <linux/pci.h> #include <acpi/apei.h> #include <linux/suspend.h> #include <linux/prmt.h> #include "internal.h" struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; EXPORT_SYMBOL(acpi_root_dir); #ifdef CONFIG_X86 #ifdef CONFIG_ACPI_CUSTOM_DSDT static inline int set_copy_dsdt(const struct dmi_system_id *id) { return 0; } #else static int set_copy_dsdt(const struct dmi_system_id *id) { pr_notice("%s detected - force copy of DSDT to local memory\n", id->ident); acpi_gbl_copy_dsdt_locally = 1; return 0; } #endif static const struct dmi_system_id dsdt_dmi_table[] __initconst = { /* * Invoke DSDT corruption work-around on all Toshiba Satellite. * https://bugzilla.kernel.org/show_bug.cgi?id=14679 */ { .callback = set_copy_dsdt, .ident = "TOSHIBA Satellite", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), DMI_MATCH(DMI_PRODUCT_NAME, "Satellite"), }, }, {} }; #endif /* -------------------------------------------------------------------------- Device Management -------------------------------------------------------------------------- */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta) { acpi_status status; status = acpi_evaluate_integer(handle, "_STA", NULL, sta); if (ACPI_SUCCESS(status)) return AE_OK; if (status == AE_NOT_FOUND) { *sta = ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED | ACPI_STA_DEVICE_UI | ACPI_STA_DEVICE_FUNCTIONING; return AE_OK; } return status; } EXPORT_SYMBOL_GPL(acpi_bus_get_status_handle); int acpi_bus_get_status(struct acpi_device *device) { acpi_status status; unsigned long long sta; if (acpi_device_override_status(device, &sta)) { acpi_set_device_status(device, sta); return 0; } /* Battery devices must have their deps met before calling _STA */ if (acpi_device_is_battery(device) && device->dep_unmet) { acpi_set_device_status(device, 0); return 0; } status = acpi_bus_get_status_handle(device->handle, &sta); if (ACPI_FAILURE(status)) return -ENODEV; if (!device->status.present && device->status.enabled) { pr_info(FW_BUG "Device [%s] status [%08x]: not present and enabled\n", device->pnp.bus_id, (u32)sta); device->status.enabled = 0; /* * The status is clearly invalid, so clear the functional bit as * well to avoid attempting to use the device. */ device->status.functional = 0; } acpi_set_device_status(device, sta); if (device->status.functional && !device->status.present) { pr_debug("Device [%s] status [%08x]: functional but not present\n", device->pnp.bus_id, (u32)sta); } pr_debug("Device [%s] status [%08x]\n", device->pnp.bus_id, (u32)sta); return 0; } EXPORT_SYMBOL(acpi_bus_get_status); void acpi_bus_private_data_handler(acpi_handle handle, void *context) { return; } EXPORT_SYMBOL(acpi_bus_private_data_handler); int acpi_bus_attach_private_data(acpi_handle handle, void *data) { acpi_status status; status = acpi_attach_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "Error attaching device data\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_attach_private_data); int acpi_bus_get_private_data(acpi_handle handle, void **data) { acpi_status status; if (!data) return -EINVAL; status = acpi_get_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "No context for object\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_get_private_data); void acpi_bus_detach_private_data(acpi_handle handle) { acpi_detach_data(handle, acpi_bus_private_data_handler); } EXPORT_SYMBOL_GPL(acpi_bus_detach_private_data); static void acpi_print_osc_error(acpi_handle handle, struct acpi_osc_context *context, char *error) { int i; acpi_handle_debug(handle, "(%s): %s\n", context->uuid_str, error); pr_debug("_OSC request data:"); for (i = 0; i < context->cap.length; i += sizeof(u32)) pr_debug(" %x", *((u32 *)(context->cap.pointer + i))); pr_debug("\n"); } acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) { acpi_status status; struct acpi_object_list input; union acpi_object in_params[4]; union acpi_object *out_obj; guid_t guid; u32 errors; struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; if (!context) return AE_ERROR; if (guid_parse(context->uuid_str, &guid)) return AE_ERROR; context->ret.length = ACPI_ALLOCATE_BUFFER; context->ret.pointer = NULL; /* Setting up input parameters */ input.count = 4; input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = (u8 *)&guid; in_params[1].type = ACPI_TYPE_INTEGER; in_params[1].integer.value = context->rev; in_params[2].type = ACPI_TYPE_INTEGER; in_params[2].integer.value = context->cap.length/sizeof(u32); in_params[3].type = ACPI_TYPE_BUFFER; in_params[3].buffer.length = context->cap.length; in_params[3].buffer.pointer = context->cap.pointer; status = acpi_evaluate_object(handle, "_OSC", &input, &output); if (ACPI_FAILURE(status)) return status; if (!output.length) return AE_NULL_OBJECT; out_obj = output.pointer; if (out_obj->type != ACPI_TYPE_BUFFER || out_obj->buffer.length != context->cap.length) { acpi_print_osc_error(handle, context, "_OSC evaluation returned wrong type"); status = AE_TYPE; goto out_kfree; } /* Need to ignore the bit0 in result code */ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); if (errors) { if (errors & OSC_REQUEST_ERROR) acpi_print_osc_error(handle, context, "_OSC request failed"); if (errors & OSC_INVALID_UUID_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid UUID"); if (errors & OSC_INVALID_REVISION_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid revision"); if (errors & OSC_CAPABILITIES_MASK_ERROR) { if (((u32 *)context->cap.pointer)[OSC_QUERY_DWORD] & OSC_QUERY_ENABLE) goto out_success; status = AE_SUPPORT; goto out_kfree; } status = AE_ERROR; goto out_kfree; } out_success: context->ret.length = out_obj->buffer.length; context->ret.pointer = kmemdup(out_obj->buffer.pointer, context->ret.length, GFP_KERNEL); if (!context->ret.pointer) { status = AE_NO_MEMORY; goto out_kfree; } status = AE_OK; out_kfree: kfree(output.pointer); return status; } EXPORT_SYMBOL(acpi_run_osc); bool osc_sb_apei_support_acked; /* * ACPI 6.0 Section 8.4.4.2 Idle State Coordination * OSPM supports platform coordinated low power idle(LPI) states */ bool osc_pc_lpi_support_confirmed; EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed); /* * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': * Starting with ACPI Specification 6.2, all _CPC registers can be in * PCC, System Memory, System IO, or Functional Fixed Hardware address * spaces. OSPM support for this more flexible register space scheme is * indicated by the “Flexible Address Space for CPPC Registers” _OSC bit. * * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: * - PCC or Functional Fixed Hardware address space if defined * - SystemMemory address space (NULL register) if not defined */ bool osc_cpc_flexible_adr_space_confirmed; EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); /* * ACPI 6.4 Operating System Capabilities for USB. */ bool osc_sb_native_usb4_support_confirmed; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed); bool osc_sb_cppc2_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_negotiate_platform_control(void) { u32 capbuf[2], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_uuid_str, .rev = 1, .cap.length = 8, .cap.pointer = capbuf, }; acpi_handle handle; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = OSC_SB_PR3_SUPPORT; /* _PR3 is in use */ if (IS_ENABLED(CONFIG_ACPI_PROCESSOR_AGGREGATOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PAD_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PROCESSOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PPC_OST_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_THERMAL)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_BATTERY)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GED_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PRMT)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_FFH)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FFH_OPR_SUPPORT; #ifdef CONFIG_ARM64 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_X86 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_ACPI_CPPC_LIB capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; #endif capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE; if (IS_ENABLED(CONFIG_SCHED_MC_PRIO)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT; if (IS_ENABLED(CONFIG_USB4)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_NATIVE_USB4_SUPPORT; if (!ghes_disable) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length <= OSC_SUPPORT_DWORD) { kfree(context.ret.pointer); return; } /* * Now run _OSC again with query flag clear and with the caps * supported by both the OS and the platform. */ capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_SUPPORT_DWORD] = capbuf_ret[OSC_SUPPORT_DWORD]; kfree(context.ret.pointer); if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length > OSC_SUPPORT_DWORD) { #ifdef CONFIG_ACPI_CPPC_LIB osc_sb_cppc2_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPCV2_SUPPORT; #endif osc_sb_apei_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT; osc_pc_lpi_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT; osc_sb_native_usb4_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT; osc_cpc_flexible_adr_space_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE; } kfree(context.ret.pointer); } /* * Native control of USB4 capabilities. If any of the tunneling bits is * set it means OS is in control and we use software based connection * manager. */ u32 osc_sb_native_usb4_control; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_control); static void acpi_bus_decode_usb_osc(const char *msg, u32 bits) { pr_info("%s USB3%c DisplayPort%c PCIe%c XDomain%c\n", msg, (bits & OSC_USB_USB3_TUNNELING) ? '+' : '-', (bits & OSC_USB_DP_TUNNELING) ? '+' : '-', (bits & OSC_USB_PCIE_TUNNELING) ? '+' : '-', (bits & OSC_USB_XDOMAIN) ? '+' : '-'); } static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A"; static void acpi_bus_osc_negotiate_usb_control(void) { u32 capbuf[3], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_usb_uuid_str, .rev = 1, .cap.length = sizeof(capbuf), .cap.pointer = capbuf, }; acpi_handle handle; acpi_status status; u32 control; if (!osc_sb_native_usb4_support_confirmed) return; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING | OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN; /* * Run _OSC first with query bit set, trying to get control over * all tunneling. The platform can then clear out bits in the * control dword that it does not want to grant to the OS. */ capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = control; status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } /* * Run _OSC again now with query bit clear and the control dword * matching what the platform granted (which may not have all * the control bits set). */ capbuf_ret = context.ret.pointer; capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = capbuf_ret[OSC_CONTROL_DWORD]; kfree(context.ret.pointer); status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } osc_sb_native_usb4_control = control & acpi_osc_ctx_get_pci_control(&context); acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control); acpi_bus_decode_usb_osc("USB4 _OSC: OS controls", osc_sb_native_usb4_control); out_free: kfree(context.ret.pointer); } /* -------------------------------------------------------------------------- Notification Handling -------------------------------------------------------------------------- */ /** * acpi_bus_notify - Global system-level (0x00-0x7F) notifications handler * @handle: Target ACPI object. * @type: Notification type. * @data: Ignored. * * This only handles notifications related to device hotplug. */ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data) { struct acpi_device *adev; switch (type) { case ACPI_NOTIFY_BUS_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_WAKE: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n"); return; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n"); /* TBD: Exactly what does 'light' mean? */ return; case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); return; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); return; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); return; default: acpi_handle_debug(handle, "Unknown event type 0x%x\n", type); return; } adev = acpi_get_acpi_dev(handle); if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) return; acpi_put_acpi_dev(adev); acpi_evaluate_ost(handle, type, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); } static void acpi_notify_device(acpi_handle handle, u32 event, void *data) { struct acpi_device *device = data; struct acpi_driver *acpi_drv = to_acpi_driver(device->dev.driver); acpi_drv->ops.notify(device, event); } static int acpi_device_install_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_status status; status = acpi_install_notify_handler(device->handle, type, acpi_notify_device, device); if (ACPI_FAILURE(status)) return -EINVAL; return 0; } static void acpi_device_remove_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_remove_notify_handler(device->handle, type, acpi_notify_device); acpi_os_wait_events_complete(); } int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context) { acpi_status status; status = acpi_install_notify_handler(adev->handle, handler_type, handler, context); if (ACPI_FAILURE(status)) return -ENODEV; return 0; } EXPORT_SYMBOL_GPL(acpi_dev_install_notify_handler); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler) { acpi_remove_notify_handler(adev->handle, handler_type, handler); acpi_os_wait_events_complete(); } EXPORT_SYMBOL_GPL(acpi_dev_remove_notify_handler); /* Handle events targeting \_SB device (at present only graceful shutdown) */ #define ACPI_SB_NOTIFY_SHUTDOWN_REQUEST 0x81 #define ACPI_SB_INDICATE_INTERVAL 10000 static void sb_notify_work(struct work_struct *dummy) { acpi_handle sb_handle; orderly_poweroff(true); /* * After initiating graceful shutdown, the ACPI spec requires OSPM * to evaluate _OST method once every 10seconds to indicate that * the shutdown is in progress */ acpi_get_handle(NULL, "\\_SB", &sb_handle); while (1) { pr_info("Graceful shutdown in progress.\n"); acpi_evaluate_ost(sb_handle, ACPI_OST_EC_OSPM_SHUTDOWN, ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS, NULL); msleep(ACPI_SB_INDICATE_INTERVAL); } } static void acpi_sb_notify(acpi_handle handle, u32 event, void *data) { static DECLARE_WORK(acpi_sb_work, sb_notify_work); if (event == ACPI_SB_NOTIFY_SHUTDOWN_REQUEST) { if (!work_busy(&acpi_sb_work)) schedule_work(&acpi_sb_work); } else { pr_warn("event %x is not supported by \\_SB device\n", event); } } static int __init acpi_setup_sb_notify_handler(void) { acpi_handle sb_handle; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &sb_handle))) return -ENXIO; if (ACPI_FAILURE(acpi_install_notify_handler(sb_handle, ACPI_DEVICE_NOTIFY, acpi_sb_notify, NULL))) return -EINVAL; return 0; } /* -------------------------------------------------------------------------- Device Matching -------------------------------------------------------------------------- */ /** * acpi_get_first_physical_node - Get first physical node of an ACPI device * @adev: ACPI device in question * * Return: First physical node of ACPI device @adev */ struct device *acpi_get_first_physical_node(struct acpi_device *adev) { struct mutex *physical_node_lock = &adev->physical_node_lock; struct device *phys_dev; mutex_lock(physical_node_lock); if (list_empty(&adev->physical_node_list)) { phys_dev = NULL; } else { const struct acpi_device_physical_node *node; node = list_first_entry(&adev->physical_node_list, struct acpi_device_physical_node, node); phys_dev = node->dev; } mutex_unlock(physical_node_lock); return phys_dev; } EXPORT_SYMBOL_GPL(acpi_get_first_physical_node); static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, const struct device *dev) { const struct device *phys_dev = acpi_get_first_physical_node(adev); return phys_dev && phys_dev == dev ? adev : NULL; } /** * acpi_device_is_first_physical_node - Is given dev first physical node * @adev: ACPI companion device * @dev: Physical device to check * * Function checks if given @dev is the first physical devices attached to * the ACPI companion device. This distinction is needed in some cases * where the same companion device is shared between many physical devices. * * Note that the caller have to provide valid @adev pointer. */ bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev) { return !!acpi_primary_dev_companion(adev, dev); } /* * acpi_companion_match() - Can we match via ACPI companion device * @dev: Device in question * * Check if the given device has an ACPI companion and if that companion has * a valid list of PNP IDs, and if the device is the first (primary) physical * device associated with it. Return the companion pointer if that's the case * or NULL otherwise. * * If multiple physical devices are attached to a single ACPI companion, we need * to be careful. The usage scenario for this kind of relationship is that all * of the physical devices in question use resources provided by the ACPI * companion. A typical case is an MFD device where all the sub-devices share * the parent's ACPI companion. In such cases we can only allow the primary * (first) physical device to be matched with the help of the companion's PNP * IDs. * * Additional physical devices sharing the ACPI companion can still use * resources available from it but they will be matched normally using functions * provided by their bus types (and analogously for their modalias). */ const struct acpi_device *acpi_companion_match(const struct device *dev) { struct acpi_device *adev; adev = ACPI_COMPANION(dev); if (!adev) return NULL; if (list_empty(&adev->pnp.ids)) return NULL; return acpi_primary_dev_companion(adev, dev); } /** * acpi_of_match_device - Match device object using the "compatible" property. * @adev: ACPI device object to match. * @of_match_table: List of device IDs to match against. * @of_id: OF ID if matched * * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of * identifiers and a _DSD object with the "compatible" property, use that * property to match against the given list of identifiers. */ static bool acpi_of_match_device(const struct acpi_device *adev, const struct of_device_id *of_match_table, const struct of_device_id **of_id) { const union acpi_object *of_compatible, *obj; int i, nval; if (!adev) return false; of_compatible = adev->data.of_compatible; if (!of_match_table || !of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) { nval = of_compatible->package.count; obj = of_compatible->package.elements; } else { /* Must be ACPI_TYPE_STRING. */ nval = 1; obj = of_compatible; } /* Now we can look for the driver DT compatible strings */ for (i = 0; i < nval; i++, obj++) { const struct of_device_id *id; for (id = of_match_table; id->compatible[0]; id++) if (!strcasecmp(obj->string.pointer, id->compatible)) { if (of_id) *of_id = id; return true; } } return false; } static bool acpi_of_modalias(struct acpi_device *adev, char *modalias, size_t len) { const union acpi_object *of_compatible; const union acpi_object *obj; const char *str, *chr; of_compatible = adev->data.of_compatible; if (!of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) obj = of_compatible->package.elements; else /* Must be ACPI_TYPE_STRING. */ obj = of_compatible; str = obj->string.pointer; chr = strchr(str, ','); strscpy(modalias, chr ? chr + 1 : str, len); return true; } /** * acpi_set_modalias - Set modalias using "compatible" property or supplied ID * @adev: ACPI device object to match * @default_id: ID string to use as default if no compatible string found * @modalias: Pointer to buffer that modalias value will be copied into * @len: Length of modalias buffer * * This is a counterpart of of_alias_from_compatible() for struct acpi_device * objects. If there is a compatible string for @adev, it will be copied to * @modalias with the vendor prefix stripped; otherwise, @default_id will be * used. */ void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len) { if (!acpi_of_modalias(adev, modalias, len)) strscpy(modalias, default_id, len); } EXPORT_SYMBOL_GPL(acpi_set_modalias); static bool __acpi_match_device_cls(const struct acpi_device_id *id, struct acpi_hardware_id *hwid) { int i, msk, byte_shift; char buf[3]; if (!id->cls) return false; /* Apply class-code bitmask, before checking each class-code byte */ for (i = 1; i <= 3; i++) { byte_shift = 8 * (3 - i); msk = (id->cls_msk >> byte_shift) & 0xFF; if (!msk) continue; sprintf(buf, "%02x", (id->cls >> byte_shift) & msk); if (strncmp(buf, &hwid->id[(i - 1) * 2], 2)) return false; } return true; } static bool __acpi_match_device(const struct acpi_device *device, const struct acpi_device_id *acpi_ids, const struct of_device_id *of_ids, const struct acpi_device_id **acpi_id, const struct of_device_id **of_id) { const struct acpi_device_id *id; struct acpi_hardware_id *hwid; /* * If the device is not present, it is unnecessary to load device * driver for it. */ if (!device || !device->status.present) return false; list_for_each_entry(hwid, &device->pnp.ids, list) { /* First, check the ACPI/PNP IDs provided by the caller. */ if (acpi_ids) { for (id = acpi_ids; id->id[0] || id->cls; id++) { if (id->id[0] && !strcmp((char *)id->id, hwid->id)) goto out_acpi_match; if (id->cls && __acpi_match_device_cls(id, hwid)) goto out_acpi_match; } } /* * Next, check ACPI_DT_NAMESPACE_HID and try to match the * "compatible" property if found. */ if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)) return acpi_of_match_device(device, of_ids, of_id); } return false; out_acpi_match: if (acpi_id) *acpi_id = id; return true; } /** * acpi_match_acpi_device - Match an ACPI device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id objects to match against. * @adev: The ACPI device pointer to match. * * Match the ACPI device @adev against a given list of ACPI IDs @ids. * * Return: * a pointer to the first matching ACPI ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev) { const struct acpi_device_id *id = NULL; __acpi_match_device(adev, ids, NULL, &id, NULL); return id; } EXPORT_SYMBOL_GPL(acpi_match_acpi_device); /** * acpi_match_device - Match a struct device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id object to match against. * @dev: The device structure to match. * * Check if @dev has a valid ACPI handle and if there is a struct acpi_device * object for that handle and use that object to match against a given list of * device IDs. * * Return a pointer to the first matching ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev) { return acpi_match_acpi_device(ids, acpi_companion_match(dev)); } EXPORT_SYMBOL_GPL(acpi_match_device); static const void *acpi_of_device_get_match_data(const struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); const struct of_device_id *match = NULL; if (!acpi_of_match_device(adev, dev->driver->of_match_table, &match)) return NULL; return match->data; } const void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *acpi_ids = dev->driver->acpi_match_table; const struct acpi_device_id *match; if (!acpi_ids) return acpi_of_device_get_match_data(dev); match = acpi_match_device(acpi_ids, dev); if (!match) return NULL; return (const void *)match->driver_data; } EXPORT_SYMBOL_GPL(acpi_device_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) { return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT; } EXPORT_SYMBOL(acpi_match_device_ids); bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { const struct acpi_device_id *acpi_ids = drv->acpi_match_table; const struct of_device_id *of_ids = drv->of_match_table; if (!acpi_ids) return acpi_of_match_device(ACPI_COMPANION(dev), of_ids, NULL); return __acpi_match_device(acpi_companion_match(dev), acpi_ids, of_ids, NULL, NULL); } EXPORT_SYMBOL_GPL(acpi_driver_match_device); /* -------------------------------------------------------------------------- ACPI Driver Management -------------------------------------------------------------------------- */ /** * __acpi_bus_register_driver - register a driver with the ACPI bus * @driver: driver being registered * @owner: owning module/driver * * Registers a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and binds. Returns zero for * success or a negative error status for failure. */ int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner) { if (acpi_disabled) return -ENODEV; driver->drv.name = driver->name; driver->drv.bus = &acpi_bus_type; driver->drv.owner = owner; return driver_register(&driver->drv); } EXPORT_SYMBOL(__acpi_bus_register_driver); /** * acpi_bus_unregister_driver - unregisters a driver with the ACPI bus * @driver: driver to unregister * * Unregisters a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and unbinds. */ void acpi_bus_unregister_driver(struct acpi_driver *driver) { driver_unregister(&driver->drv); } EXPORT_SYMBOL(acpi_bus_unregister_driver); /* -------------------------------------------------------------------------- ACPI Bus operations -------------------------------------------------------------------------- */ static int acpi_bus_match(struct device *dev, const struct device_driver *drv) { struct acpi_device *acpi_dev = to_acpi_device(dev); const struct acpi_driver *acpi_drv = to_acpi_driver(drv); return acpi_dev->flags.match_driver && !acpi_match_device_ids(acpi_dev, acpi_drv->ids); } static int acpi_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { return __acpi_device_uevent_modalias(to_acpi_device(dev), env); } static int acpi_device_probe(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); int ret; if (acpi_dev->handler && !acpi_is_pnp_device(acpi_dev)) return -EINVAL; if (!acpi_drv->ops.add) return -ENOSYS; ret = acpi_drv->ops.add(acpi_dev); if (ret) { acpi_dev->driver_data = NULL; return ret; } pr_debug("Driver [%s] successfully bound to device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { ret = acpi_device_install_notify_handler(acpi_dev, acpi_drv); if (ret) { if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; return ret; } } pr_debug("Found driver [%s] for device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); get_device(dev); return 0; } static void acpi_device_remove(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); if (acpi_drv->ops.notify) acpi_device_remove_notify_handler(acpi_dev, acpi_drv); if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; put_device(dev); } const struct bus_type acpi_bus_type = { .name = "acpi", .match = acpi_bus_match, .probe = acpi_device_probe, .remove = acpi_device_remove, .uevent = acpi_device_uevent, }; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data) { return bus_for_each_dev(&acpi_bus_type, NULL, data, fn); } EXPORT_SYMBOL_GPL(acpi_bus_for_each_dev); struct acpi_dev_walk_context { int (*fn)(struct acpi_device *, void *); void *data; }; static int acpi_dev_for_one_check(struct device *dev, void *context) { struct acpi_dev_walk_context *adwc = context; if (dev->bus != &acpi_bus_type) return 0; return adwc->fn(to_acpi_device(dev), adwc->data); } EXPORT_SYMBOL_GPL(acpi_dev_for_each_child); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check); } int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child_reverse(&adev->dev, &adwc, acpi_dev_for_one_check); } /* -------------------------------------------------------------------------- Initialization/Cleanup -------------------------------------------------------------------------- */ static int __init acpi_bus_init_irq(void) { acpi_status status; char *message = NULL; /* * Let the system know what interrupt model we are using by * evaluating the \_PIC object, if exists. */ switch (acpi_irq_model) { case ACPI_IRQ_MODEL_PIC: message = "PIC"; break; case ACPI_IRQ_MODEL_IOAPIC: message = "IOAPIC"; break; case ACPI_IRQ_MODEL_IOSAPIC: message = "IOSAPIC"; break; case ACPI_IRQ_MODEL_GIC: message = "GIC"; break; case ACPI_IRQ_MODEL_PLATFORM: message = "platform specific model"; break; case ACPI_IRQ_MODEL_LPIC: message = "LPIC"; break; case ACPI_IRQ_MODEL_RINTC: message = "RINTC"; break; default: pr_info("Unknown interrupt routing model\n"); return -ENODEV; } pr_info("Using %s for interrupt routing\n", message); status = acpi_execute_simple_method(NULL, "\\_PIC", acpi_irq_model); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { pr_info("_PIC evaluation failed: %s\n", acpi_format_exception(status)); return -ENODEV; } return 0; } /** * acpi_early_init - Initialize ACPICA and populate the ACPI namespace. * * The ACPI tables are accessible after this, but the handling of events has not * been initialized and the global lock is not available yet, so AML should not * be executed at this point. * * Doing this before switching the EFI runtime services to virtual mode allows * the EfiBootServices memory to be freed slightly earlier on boot. */ void __init acpi_early_init(void) { acpi_status status; if (acpi_disabled) return; pr_info("Core revision %08x\n", ACPI_CA_VERSION); /* enable workarounds, unless strict ACPI spec. compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; acpi_permanent_mmap = true; #ifdef CONFIG_X86 /* * If the machine falls into the DMI check table, * DSDT will be copied to memory. * Note that calling dmi_check_system() here on other architectures * would not be OK because only x86 initializes dmi early enough. * Thankfully only x86 systems need such quirks for now. */ dmi_check_system(dsdt_dmi_table); #endif status = acpi_reallocate_root_table(); if (ACPI_FAILURE(status)) { pr_err("Unable to reallocate ACPI tables\n"); goto error0; } status = acpi_initialize_subsystem(); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize the ACPI Interpreter\n"); goto error0; } #ifdef CONFIG_X86 if (!acpi_ioapic) { /* compatible (0) means level (3) */ if (!(acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)) { acpi_sci_flags &= ~ACPI_MADT_TRIGGER_MASK; acpi_sci_flags |= ACPI_MADT_TRIGGER_LEVEL; } /* Set PIC-mode SCI trigger type */ acpi_pic_sci_set_trigger(acpi_gbl_FADT.sci_interrupt, (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2); } else { /* * now that acpi_gbl_FADT is initialized, * update it with result from INT_SRC_OVR parsing */ acpi_gbl_FADT.sci_interrupt = acpi_sci_override_gsi; } #endif return; error0: disable_acpi(); } /** * acpi_subsystem_init - Finalize the early initialization of ACPI. * * Switch over the platform to the ACPI mode (if possible). * * Doing this too early is generally unsafe, but at the same time it needs to be * done before all things that really depend on ACPI. The right spot appears to * be before finalizing the EFI initialization. */ void __init acpi_subsystem_init(void) { acpi_status status; if (acpi_disabled) return; status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to enable ACPI\n"); disable_acpi(); } else { /* * If the system is using ACPI then we can be reasonably * confident that any regulators are managed by the firmware * so tell the regulator core it has everything it needs to * know. */ regulator_has_full_constraints(); } } static acpi_status acpi_bus_table_handler(u32 event, void *table, void *context) { if (event == ACPI_TABLE_EVENT_LOAD) acpi_scan_table_notify(); return acpi_sysfs_table_handler(event, table, context); } static int __init acpi_bus_init(void) { int result; acpi_status status; acpi_os_initialize1(); status = acpi_load_tables(); if (ACPI_FAILURE(status)) { pr_err("Unable to load the System Description Tables\n"); goto error1; } /* * ACPI 2.0 requires the EC driver to be loaded and work before the EC * device is found in the namespace. * * This is accomplished by looking for the ECDT table and getting the EC * parameters out of that. * * Do that before calling acpi_initialize_objects() which may trigger EC * address space accesses. */ acpi_ec_ecdt_probe(); status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to start the ACPI Interpreter\n"); goto error1; } status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize ACPI objects\n"); goto error1; } /* * _OSC method may exist in module level code, * so it must be run after ACPI_FULL_INITIALIZATION */ acpi_bus_osc_negotiate_platform_control(); acpi_bus_osc_negotiate_usb_control(); /* * _PDC control method may load dynamic SSDT tables, * and we need to install the table handler before that. */ status = acpi_install_table_handler(acpi_bus_table_handler, NULL); acpi_sysfs_init(); acpi_early_processor_control_setup(); /* * Maybe EC region is required at bus_scan/acpi_get_devices. So it * is necessary to enable it as early as possible. */ acpi_ec_dsdt_probe(); pr_info("Interpreter enabled\n"); /* Initialize sleep structures */ acpi_sleep_init(); /* * Get the system interrupt model and evaluate \_PIC. */ result = acpi_bus_init_irq(); if (result) goto error1; /* * Register for all standard device notifications. */ status = acpi_install_notify_handler(ACPI_ROOT_OBJECT, ACPI_SYSTEM_NOTIFY, &acpi_bus_notify, NULL); if (ACPI_FAILURE(status)) { pr_err("Unable to register for system notifications\n"); goto error1; } /* * Create the top ACPI proc directory */ acpi_root_dir = proc_mkdir(ACPI_BUS_FILE_ROOT, NULL); result = bus_register(&acpi_bus_type); if (!result) return 0; /* Mimic structured exception handling */ error1: acpi_terminate(); return -ENODEV; } struct kobject *acpi_kobj; EXPORT_SYMBOL_GPL(acpi_kobj); void __weak __init acpi_arch_init(void) { } static int __init acpi_init(void) { int result; if (acpi_disabled) { pr_info("Interpreter disabled.\n"); return -ENODEV; } acpi_kobj = kobject_create_and_add("acpi", firmware_kobj); if (!acpi_kobj) { pr_err("Failed to register kobject\n"); return -ENOMEM; } init_prmt(); acpi_init_pcc(); result = acpi_bus_init(); if (result) { kobject_put(acpi_kobj); disable_acpi(); return result; } acpi_init_ffh(); pci_mmcfg_late_init(); acpi_viot_early_init(); acpi_hest_init(); acpi_ghes_init(); acpi_arch_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); acpi_sleep_proc_init(); acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); acpi_viot_init(); return 0; } subsys_initcall(acpi_init);
2 2 2 2 2 2 1 7 8 8 8 8 5 8 8 8 1 1 18 17 10 8 8 7 7 7 7 4 3 2 4 1 3 10 10 10 9 1 9 9 9 8 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <net/sock.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/jhash.h> #include <linux/ratelimit.h> #include "rds.h" static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; /* Create a key for the bind hash table manipulation. Port is in network byte * order. */ static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, __be16 port, __u32 scope_id) { memcpy(key, addr, sizeof(*addr)); key += sizeof(*addr); memcpy(key, &port, sizeof(port)); key += sizeof(port); memcpy(key, &scope_id, sizeof(scope_id)); } /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. */ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id) { u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; __rds_create_bind_key(key, addr, port, scope_id); rcu_read_lock(); rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) || !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt))) rs = NULL; rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, ntohs(port)); return rs; } /* returns -ve errno or +ve port */ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); if (rover == RDS_FLAG_PROBE_PORT) return -EINVAL; last = rover; } else { rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } do { if (rover == 0) rover++; if (rover == RDS_FLAG_PROBE_PORT) continue; __rds_create_bind_key(key, addr, cpu_to_be16(rover), scope_id); if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; rs->rs_bound_scope_id = scope_id; ret = 0; rdsdebug("rs %p binding to %pI6c:%d\n", rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; } } while (rover++ != last); return ret; } void rds_remove_bound(struct rds_sock *rs) { if (ipv6_addr_any(&rs->rs_bound_addr)) return; rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; int ret = 0; __be16 port; /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. */ if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in) || sin->sin_addr.s_addr == htonl(INADDR_ANY) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(sin->sin_addr.s_addr)) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; #if IS_ENABLED(CONFIG_IPV6) } else if (uaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; int addr_type; if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) return -EINVAL; /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) return -EINVAL; } /* The scope ID must be specified for link local address. */ if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) return -EINVAL; scope_id = sin6->sin6_scope_id; } binding_addr = &sin6->sin6_addr; port = sin6->sin6_port; #endif } else { return -EINVAL; } lock_sock(sk); /* RDS socket does not allow re-binding. */ if (!ipv6_addr_any(&rs->rs_bound_addr)) { ret = -EINVAL; goto out; } /* Socket is connected. The binding address should have the same * scope ID as the connected address, except the case when one is * non-link local address (scope_id is 0). */ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && rs->rs_bound_scope_id && scope_id != rs->rs_bound_scope_id) { ret = -EINVAL; goto out; } /* The transport can be set using SO_RDS_TRANSPORT option before the * socket is bound. */ if (rs->rs_transport) { trans = rs->rs_transport; if (!trans->laddr_check || trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; } } else { trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, scope_id); if (!trans) { ret = -EADDRNOTAVAIL; pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", __func__, binding_addr); goto out; } rs->rs_transport = trans; } sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) rs->rs_transport = NULL; out: release_sock(sk); return ret; } void rds_bind_lock_destroy(void) { rhashtable_destroy(&bind_hash_table); } int rds_bind_lock_init(void) { return rhashtable_init(&bind_hash_table, &ht_parms); }
2 2 2 2 2 2 2 2 2 420 419 1 413 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 // SPDX-License-Identifier: GPL-2.0-only /* Page fragment allocator * * Page Fragment: * An arbitrary-length arbitrary-offset area of memory which resides within a * 0 or higher order page. Multiple fragments within that page are * individually refcounted, in the page's reference counter. * * The page_frag functions provide a simple allocation framework for page * fragments. This is used by the network stack and network device drivers to * provide a backing region of memory for use as either an sk_buff->head, or to * be used in the "frags" portion of skb_shared_info. */ #include <linux/build_bug.h> #include <linux/export.h> #include <linux/gfp_types.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/page_frag_cache.h> #include "internal.h" static unsigned long encoded_page_create(struct page *page, unsigned int order, bool pfmemalloc) { BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK); BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE); return (unsigned long)page_address(page) | (order & PAGE_FRAG_CACHE_ORDER_MASK) | ((unsigned long)pfmemalloc * PAGE_FRAG_CACHE_PFMEMALLOC_BIT); } static unsigned long encoded_page_decode_order(unsigned long encoded_page) { return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK; } static void *encoded_page_decode_virt(unsigned long encoded_page) { return (void *)(encoded_page & PAGE_MASK); } static struct page *encoded_page_decode_page(unsigned long encoded_page) { return virt_to_page((void *)encoded_page); } static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp_mask) { unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER; struct page *page = NULL; gfp_t gfp = gfp_mask; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER, numa_mem_id(), NULL); #endif if (unlikely(!page)) { page = __alloc_pages(gfp, 0, numa_mem_id(), NULL); order = 0; } nc->encoded_page = page ? encoded_page_create(page, order, page_is_pfmemalloc(page)) : 0; return page; } void page_frag_cache_drain(struct page_frag_cache *nc) { if (!nc->encoded_page) return; __page_frag_cache_drain(encoded_page_decode_page(nc->encoded_page), nc->pagecnt_bias); nc->encoded_page = 0; } EXPORT_SYMBOL(page_frag_cache_drain); void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask) { unsigned long encoded_page = nc->encoded_page; unsigned int size, offset; struct page *page; if (unlikely(!encoded_page)) { refill: page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; encoded_page = nc->encoded_page; /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = 0; } size = PAGE_SIZE << encoded_page_decode_order(encoded_page); offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask); if (unlikely(offset + fragsz > size)) { if (unlikely(fragsz > PAGE_SIZE)) { /* * The caller is trying to allocate a fragment * with fragsz > PAGE_SIZE but the cache isn't big * enough to satisfy the request, this may * happen in low memory conditions. * We don't release the cache page because * it could make memory pressure worse * so we simply return NULL here. */ return NULL; } page = encoded_page_decode_page(encoded_page); if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } /* OK, page count is 0, we can safely set it */ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = 0; } nc->pagecnt_bias--; nc->offset = offset + fragsz; return encoded_page_decode_virt(encoded_page) + offset; } EXPORT_SYMBOL(__page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free);
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 2 6 6 6 6 6 18 16 15 14 15 11 15 18 3 3 3 3 3 10 10 10 10 10 15 14 15 20 20 18 20 20 4 4 3 3 3 3 4 4 1 16 18 18 4 4 4 4 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 // SPDX-License-Identifier: GPL-2.0 #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/bitmap.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/proc_fs.h> #include "../mount.h" #include "internal.h" #include "fd.h" static int seq_show(struct seq_file *m, void *v) { struct files_struct *files = NULL; int f_flags = 0, ret = -ENOENT; struct file *file = NULL; struct task_struct *task; task = get_proc_task(m->private); if (!task) return -ENOENT; task_lock(task); files = task->files; if (files) { unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, fd); if (file) { f_flags = file->f_flags; if (close_on_exec(fd, files)) f_flags |= O_CLOEXEC; get_file(file); ret = 0; } spin_unlock(&files->file_lock); } task_unlock(task); put_task_struct(task); if (ret) return ret; seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; if (file->f_op->show_fdinfo) file->f_op->show_fdinfo(m, file); out: fput(file); return 0; } static int seq_fdinfo_open(struct inode *inode, struct file *file) { return single_open(file, seq_show, inode); } /* * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure * that the current task has PTRACE_MODE_READ in addition to the normal * POSIX-like checks. */ static int proc_fdinfo_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool allowed = false; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); if (!allowed) return -EACCES; return generic_permission(idmap, inode, mask); } static const struct inode_operations proc_fdinfo_file_inode_operations = { .permission = proc_fdinfo_permission, .setattr = proc_setattr, }; static const struct file_operations proc_fdinfo_file_operations = { .open = seq_fdinfo_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; file = fget_task(task, fd); if (file) { *mode = file->f_mode; fput(file); } return !!file; } static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, fmode_t f_mode) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); if (S_ISLNK(inode->i_mode)) { unsigned i_mode = S_IFLNK; if (f_mode & FMODE_READ) i_mode |= S_IRUSR | S_IXUSR; if (f_mode & FMODE_WRITE) i_mode |= S_IWUSR | S_IXUSR; inode->i_mode = i_mode; } security_task_to_inode(task, inode); } static int tid_fd_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct task_struct *task; struct inode *inode; unsigned int fd; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); fd = proc_fd(inode); if (task) { fmode_t f_mode; if (tid_fd_mode(task, fd, &f_mode)) { tid_fd_update_inode(task, inode, f_mode); put_task_struct(task); return 1; } put_task_struct(task); } return 0; } static const struct dentry_operations tid_fd_dentry_operations = { .d_revalidate = tid_fd_revalidate, .d_delete = pid_delete_dentry, }; static int proc_fd_link(struct dentry *dentry, struct path *path) { struct task_struct *task; int ret = -ENOENT; task = get_proc_task(d_inode(dentry)); if (task) { unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; fd_file = fget_task(task, fd); if (fd_file) { *path = fd_file->f_path; path_get(&fd_file->f_path); ret = 0; fput(fd_file); } put_task_struct(task); } return ret; } struct fd_data { fmode_t mode; unsigned fd; }; static struct dentry *proc_fd_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->fd = data->fd; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; tid_fd_update_inode(task, inode, data->mode); return proc_splice_unmountable(inode, dentry, &tid_fd_dentry_operations); } static struct dentry *proc_lookupfd_common(struct inode *dir, struct dentry *dentry, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); struct fd_data data = {.fd = name_to_int(&dentry->d_name)}; struct dentry *result = ERR_PTR(-ENOENT); if (!task) goto out_no_task; if (data.fd == ~0U) goto out; if (!tid_fd_mode(task, data.fd, &data.mode)) goto out; result = instantiate(dentry, task, &data); out: put_task_struct(task); out_no_task: return result; } static int proc_readfd_common(struct file *file, struct dir_context *ctx, instantiate_t instantiate) { struct task_struct *p = get_proc_task(file_inode(file)); unsigned int fd; if (!p) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) break; cond_resched(); } out: put_task_struct(p); return 0; } static int proc_readfd_count(struct inode *inode, loff_t *count) { struct task_struct *p = get_proc_task(inode); struct fdtable *fdt; if (!p) return -ENOENT; task_lock(p); if (p->files) { rcu_read_lock(); fdt = files_fdtable(p->files); *count = bitmap_weight(fdt->open_fds, fdt->max_fds); rcu_read_unlock(); } task_unlock(p); put_task_struct(p); return 0; } static int proc_fd_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, .iterate_shared = proc_fd_iterate, .llseek = generic_file_llseek, }; static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); } /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ int proc_fd_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct task_struct *p; int rv; rv = generic_permission(&nop_mnt_idmap, inode, mask); if (rv == 0) return rv; rcu_read_lock(); p = pid_task(proc_pid(inode), PIDTYPE_PID); if (p && same_thread_group(p, current)) rv = 0; rcu_read_unlock(); return rv; } static int proc_fd_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return proc_readfd_count(inode, &stat->size); } const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, .getattr = proc_fd_getattr, .setattr = proc_setattr, }; static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->fd = data->fd; inode->i_op = &proc_fdinfo_file_inode_operations; inode->i_fop = &proc_fdinfo_file_operations; tid_fd_update_inode(task, inode, 0); return proc_splice_unmountable(inode, dentry, &tid_fd_dentry_operations); } static struct dentry * proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); } const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .permission = proc_fdinfo_permission, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, .iterate_shared = proc_fdinfo_iterate, .llseek = generic_file_llseek, };
2 2 1 1 1 5 1 5 5 5 5 4 4 4 4 4 4 4 1 4 4 5 5 5 5 5 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc root directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/fs_parser.h> #include <linux/cred.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; enum proc_hidepid hidepid; int gid; enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), fsparam_file_or_string("pidns", Opt_pidns), {} }; static inline int valid_hidepid(unsigned int value) { return (value == HIDEPID_OFF || value == HIDEPID_NO_ACCESS || value == HIDEPID_INVISIBLE || value == HIDEPID_NOT_PTRACEABLE); } static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); struct fs_parse_result result; int base = (unsigned long)hidepid_u32_spec.data; if (param->type != fs_value_is_string) return invalf(fc, "proc: unexpected type of hidepid value\n"); if (!kstrtouint(param->string, base, &result.uint_32)) { if (!valid_hidepid(result.uint_32)) return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); ctx->hidepid = result.uint_32; return 0; } if (!strcmp(param->string, "off")) ctx->hidepid = HIDEPID_OFF; else if (!strcmp(param->string, "noaccess")) ctx->hidepid = HIDEPID_NO_ACCESS; else if (!strcmp(param->string, "invisible")) ctx->hidepid = HIDEPID_INVISIBLE; else if (!strcmp(param->string, "ptraceable")) ctx->hidepid = HIDEPID_NOT_PTRACEABLE; else return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); return 0; } static int proc_parse_subset_param(struct fs_context *fc, char *value) { struct proc_fs_context *ctx = fc->fs_private; while (value) { char *ptr = strchr(value, ','); if (ptr != NULL) *ptr++ = '\0'; if (*value != '\0') { if (!strcmp(value, "pid")) { ctx->pidonly = PROC_PIDONLY_ON; } else { return invalf(fc, "proc: unsupported subset option - %s\n", value); } } value = ptr; } return 0; } #ifdef CONFIG_PID_NS static int proc_parse_pidns_param(struct fs_context *fc, struct fs_parameter *param, struct fs_parse_result *result) { struct proc_fs_context *ctx = fc->fs_private; struct pid_namespace *target, *active = task_active_pid_ns(current); struct ns_common *ns; struct file *ns_filp __free(fput) = NULL; switch (param->type) { case fs_value_is_file: /* came through fsconfig, steal the file reference */ ns_filp = no_free_ptr(param->file); break; case fs_value_is_string: ns_filp = filp_open(param->string, O_RDONLY, 0); break; default: WARN_ON_ONCE(true); break; } if (!ns_filp) ns_filp = ERR_PTR(-EBADF); if (IS_ERR(ns_filp)) { errorfc(fc, "could not get file from pidns argument"); return PTR_ERR(ns_filp); } if (!proc_ns_file(ns_filp)) return invalfc(fc, "pidns argument is not an nsfs file"); ns = get_proc_ns(file_inode(ns_filp)); if (ns->ns_type != CLONE_NEWPID) return invalfc(fc, "pidns argument is not a pidns file"); target = container_of(ns, struct pid_namespace, ns); /* * pidns= is shorthand for joining the pidns to get a fsopen fd, so the * permission model should be the same as pidns_install(). */ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { errorfc(fc, "insufficient permissions to set pidns"); return -EPERM; } if (!pidns_is_ancestor(target, active)) return invalfc(fc, "cannot set pidns to non-descendant pidns"); put_pid_ns(ctx->pid_ns); ctx->pid_ns = get_pid_ns(target); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); return 0; } #endif /* CONFIG_PID_NS */ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_gid: ctx->gid = result.uint_32; break; case Opt_hidepid: err = proc_parse_hidepid_param(fc, param); if (err) return err; break; case Opt_subset: err = proc_parse_subset_param(fc, param->string); if (err) return err; break; case Opt_pidns: #ifdef CONFIG_PID_NS /* * We would have to RCU-protect every proc_pid_ns() or * proc_sb_info() access if we allowed this to be reconfigured * for an existing procfs instance. Luckily, procfs instances * are cheap to create, and mount-beneath would let you * atomically replace an instance even with overmounts. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { errorfc(fc, "cannot reconfigure pidns for existing procfs"); return -EBUSY; } err = proc_parse_pidns_param(fc, param, &result); if (err) return err; break; #else errorfc(fc, "pidns mount flag not supported on this system"); return -EOPNOTSUPP; #endif default: return -EINVAL; } ctx->mask |= 1 << opt; return 0; } static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; if (ctx->mask & (1 << Opt_pidns) && !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; struct proc_fs_info *fs_info; int ret; fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); if (!fs_info) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } ret = proc_setup_self(s); if (ret) { return ret; } return proc_setup_thread_self(s); } static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; put_pid_ns(ctx->pid_ns); kfree(ctx); } static const struct fs_context_operations proc_fs_context_ops = { .free = proc_fs_context_free, .parse_param = proc_parse_param, .get_tree = proc_get_tree, .reconfigure = proc_reconfigure, }; static int proc_init_fs_context(struct fs_context *fc) { struct proc_fs_context *ctx; ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; } static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); kfree_rcu(fs_info, rcu); } } static struct file_system_type proc_fs_type = { .name = "proc", .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { proc_init_kmemcache(); set_proc_pid_nlink(); proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); /* * Last things last. It is not like userspace processes eager * to open /proc files exist at this point but register last * anyway. */ register_filesystem(&proc_fs_type); } static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { int error = proc_readdir(file, ctx); if (unlikely(error <= 0)) return error; ctx->pos = FIRST_PROCESS_ENTRY; } return proc_pid_readdir(file, ctx); } /* * The root /proc directory is special, as it has the * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, .iterate_shared = proc_root_readdir, .llseek = generic_file_llseek, }; /* * proc root can do almost nothing.. */ static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; /* * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { .low_ino = PROCFS_ROOT_INO, .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", };
6 6 6 28 29 21 12 4 2 2 1 2 2 3 1 1 2 28 17 17 37 37 13 29 13 25 12 12 4 25 13 12 12 12 2 12 25 18 15 5 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } /* * ima_reset_appraise_flags - reset ima_iint_cache flags * * @digsig: whether to clear/set IMA_DIGSIG flag, tristate values * 0: clear IMA_DIGSIG * 1: set IMA_DIGSIG * -1: don't change IMA_DIGSIG * */ static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig == 1) set_bit(IMA_DIGSIG, &iint->atomic_flags); else if (digsig == 0) clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } else { digsig = -1; } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), -1); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result, digsig = -1; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { if (!strcmp(xattr_name, XATTR_NAME_IMA)) digsig = 0; ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
8 7 8 8 8 8 8 8 8 7 7 8 1 9 9 9 8 9 9 9 9 8 8 8 8 1 1 1 1 8 1 1 1 6 6 6 8 8 7 8 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 7 8 8 8 8 8 8 8 8 7 8 8 7 8 7 8 8 7 8 8 8 8 8 8 8 8 8 8 7 7 8 8 8 8 7 22 21 22 22 21 21 8 8 8 7 7 8 8 8 8 8 8 8 5 5 1 1 5 1 4 4 4 4 4 4 4 4 4 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/memory-tiers.h> #include <linux/oom.h> #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> #include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> #include <linux/balloon_compaction.h> #include <linux/sched/sysctl.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. */ nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. */ struct mem_cgroup *target_mem_cgroup; /* * Scan pressure balancing between anon and file LRUs */ unsigned long anon_cost; unsigned long file_cost; /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ int *proactive_swappiness; /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ unsigned int no_cache_trim_mode:1; /* Has cache_trim_mode failed at least once? */ unsigned int cache_trim_mode_failed:1; /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at * reduced force or passed over entirely due to its memory.low * setting (memcg_low_skipped), and nothing is reclaimed as a * result, then go back for one more cycle that reclaims the protected * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; /* Shared cgroup tree walk failed, rescan the whole tree */ unsigned int memcg_full_walk:1; unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; /* Allocation order */ s8 order; /* Scan (total_size >> priority) pages at once */ s8 priority; /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ gfp_t gfp_mask; /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; struct { unsigned int dirty; unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; unsigned int file_taken; unsigned int taken; } nr; /* for recording the reclaimed slab by now */ struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ if ((_folio)->lru.prev != _base) { \ struct folio *prev; \ \ prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; #ifdef CONFIG_MEMCG /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } /* * Returns true for reclaim on the root cgroup. This is true for direct * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. */ static bool root_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ static bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; #endif return false; } static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { if (sc->proactive && sc->proactive_swappiness) return *sc->proactive_swappiness; return mem_cgroup_swappiness(memcg); } #else static bool cgroup_reclaim(struct scan_control *sc) { return false; } static bool root_reclaim(struct scan_control *sc) { return true; } static bool writeback_throttling_sane(struct scan_control *sc) { return true; } static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { return READ_ONCE(vm_swappiness); } #endif /* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to * and including the specified highidx * @zone: The current zone in the iterator * @pgdat: The pgdat which node_zones are being iterated * @idx: The index variable * @highidx: The index of the highest zone to return * * This macro iterates through all managed zones up to and including the specified highidx. * The zone iterator enters an invalid state after macro call and must be reinitialized * before it can be used again. */ #define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ for ((idx) = 0, (zone) = (pgdat)->node_zones; \ (idx) <= (highidx); \ (idx)++, (zone)++) \ if (!managed_zone(zone)) \ continue; \ else static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { /* Check for an overwrite */ WARN_ON_ONCE(rs && task->reclaim_state); /* Check for the nulling of an already-nulled member */ WARN_ON_ONCE(!rs && !task->reclaim_state); task->reclaim_state = rs; } /* * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to * scan_control->nr_reclaimed. */ static void flush_reclaim_state(struct scan_control *sc) { /* * Currently, reclaim_state->reclaimed includes three types of pages * freed outside of vmscan: * (1) Slab pages. * (2) Clean file pages from pruned inodes (on highmem systems). * (3) XFS freed buffer pages. * * For all of these cases, we cannot universally link the pages to a * single memcg. For example, a memcg-aware shrinker can free one object * charged to the target memcg, causing an entire page to be freed. * If we count the entire page as reclaimed from the memcg, we end up * overestimating the reclaimed amount (potentially under-reclaiming). * * Only count such pages for global reclaim to prevent under-reclaiming * from the target memcg; preventing unnecessary retries during memcg * charging and false positives from proactive reclaim. * * For uncommon cases where the freed pages were actually mostly * charged to the target memcg, we end up underestimating the reclaimed * amount. This should be fine. The freed pages will be uncharged * anyway, even if they are not counted here properly, and we will be * able to make forward progress in charging (which is usually in a * retry loop). * * We can go one step further, and report the uncharged objcg pages in * memcg reclaim, to make reporting more accurate and reduce * underestimation, but it's probably not worth the complexity for now. */ if (current->reclaim_state && root_reclaim(sc)) { sc->nr_reclaimed += current->reclaim_state->reclaimed; current->reclaim_state->reclaimed = 0; } } static bool can_demote(int nid, struct scan_control *sc, struct mem_cgroup *memcg) { int demotion_nid; if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; demotion_nid = next_demotion_node(nid); if (demotion_nid == NUMA_NO_NODE) return false; /* If demotion node isn't in the cgroup's mems_allowed, fall back */ return mem_cgroup_node_allowed(memcg, demotion_nid); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc) { if (memcg == NULL) { /* * For non-memcg reclaim, is there * space in any swap device? */ if (get_nr_swap_pages() > 0) return true; } else { /* Is the memcg below its swap limit? */ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) return true; } /* * The page can not be swapped. * * Can it be reclaimed from this node via demotion? */ return can_demote(nid, sc, memcg); } /* * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); return nr; } /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; struct zone *zone; for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } return size; } static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; struct mem_cgroup *memcg = NULL; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); return freed; } void drop_slab(void) { int nid; int shift = 0; unsigned long freed; do { freed = 0; for_each_online_node(nid) { if (fatal_signal_pending(current)) return; freed += drop_slab_node(nid); } } while ((freed >> shift++) > 1); } #define CHECK_RECLAIMER_OFFSET(type) \ do { \ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ PGSCAN_##type - PGSCAN_KSWAPD); \ } while (0) static int reclaimer_offset(struct scan_control *sc) { CHECK_RECLAIMER_OFFSET(DIRECT); CHECK_RECLAIMER_OFFSET(KHUGEPAGED); CHECK_RECLAIMER_OFFSET(PROACTIVE); if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; if (sc->proactive) return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing * prevents it from being freed up. But we have a ref on the folio and once * that folio is locked, the mapping is pinned. * * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, struct folio *folio, int error) { folio_lock(folio); if (folio_mapping(folio) == mapping) mapping_set_error(mapping, error); folio_unlock(folio); } static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* * If there are a lot of dirty/writeback folios then do not * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); } if (2 * write_pending <= reclaimable) return true; return false; } void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; long timeout, ret; DEFINE_WAIT(wait); /* * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && current->flags & (PF_USER_WORKER|PF_KTHREAD)) { cond_resched(); return; } /* * These figures are pulled out of thin air. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many * parallel reclaimers which is a short-lived event so the timeout is * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { case VMSCAN_THROTTLE_WRITEBACK: timeout = HZ/10; if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { WRITE_ONCE(pgdat->nr_reclaim_start, node_page_state(pgdat, NR_THROTTLED_WRITTEN)); } break; case VMSCAN_THROTTLE_CONGESTED: fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: if (skip_throttle_noprogress(pgdat)) { cond_resched(); return; } timeout = 1; break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; break; default: WARN_ON_ONCE(1); timeout = HZ; break; } prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = schedule_timeout(timeout); finish_wait(wqh, &wait); if (reason == VMSCAN_THROTTLE_WRITEBACK) atomic_dec(&pgdat->nr_writeback_throttled); trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), jiffies_to_usecs(timeout - ret), reason); } /* * Account for folios written if tasks are throttled waiting on dirty * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled) { unsigned long nr_written; node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); /* * This is an inaccurate read as the per-cpu deltas may not * be synchronised. However, given that the system is * writeback throttled, it is not worth taking the penalty * of getting an accurate count. At worst, the throttle * timeout guarantees forward progress. */ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - READ_ONCE(pgdat->nr_reclaim_start); if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); } /* possible outcome of pageout() */ typedef enum { /* failed to write folio out, folio is locked */ PAGE_KEEP, /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; static pageout_t writeout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { int res; folio_set_reclaim(folio); /* * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled * or we failed to allocate contiguous swap entries, in which case * the split out folios get added back to folio_list. */ if (shmem_mapping(mapping)) res = shmem_writeout(folio, plug, folio_list); else res = swap_writeout(folio, plug); if (res < 0) handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { folio_clear_reclaim(folio); return PAGE_ACTIVATE; } /* synchronous write? */ if (!folio_test_writeback(folio)) folio_clear_reclaim(folio); trace_mm_vmscan_write_folio(folio); node_stat_add_folio(folio, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } /* * pageout is called by shrink_folio_list() for each dirty folio. */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { /* * We no longer attempt to writeback filesystem folios here, other * than tmpfs/shmem. That's taken care of in page-writeback. * If we find a dirty filesystem folio at the end of the LRU list, * typically that means the filesystem is saturating the storage * with contiguous writes and telling it to write a folio here * would only make the situation worse by injecting an element * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. * * A freeable shmem or swapcache folio is referenced only by the * caller that isolated the folio and the page cache. */ if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; if (!folio_clear_dirty_for_io(folio)) return PAGE_CLEAN; return writeout(folio, mapping, plug, folio_list); } /* * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg) { int refcount; void *shadow = NULL; struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); if (folio_test_swapcache(folio)) { ci = swap_cluster_get_and_lock_irq(folio); } else { spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); } /* * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has * a ref to the folio, it may be possible that they dirty it then * drop the reference. So if the dirty flag is tested before the * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !folio_test_dirty(folio) [good] * folio_set_dirty(folio); * folio_put(folio); * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the folio->flags * load is not satisfied before that of folio->_refcount. * * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; } if (folio_test_swapcache(folio)) { swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optimization, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. */ if (reclaimed && folio_is_file_lru(folio) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(folio, target_memcg); __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) free_folio(folio); } return 1; cannot_free: if (folio_test_swapcache(folio)) { swap_cluster_unlock_irq(ci); } else { xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); } return 0; } /** * remove_mapping() - Attempt to remove a folio from its mapping. * @mapping: The address space. * @folio: The folio to remove. * * If the folio is dirty, under writeback or if someone else has a ref * on it, removal will fail. * Return: The number of pages removed from the mapping. 0 if the folio * could not be removed. * Context: The caller should have a single refcount on the folio and * hold its lock. */ long remove_mapping(struct address_space *mapping, struct folio *folio) { if (__remove_mapping(mapping, folio, false, NULL)) { /* * Unfreezing the refcount with 1 effectively * drops the pagecache ref for us without requiring another * atomic operation. */ folio_ref_unfreeze(folio, 1); return folio_nr_pages(folio); } return 0; } /** * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. * @folio: Folio to be returned to an LRU list. * * Add previously isolated @folio to appropriate LRU list. * The folio may still be unevictable for other reasons. * * Context: lru_lock must not be held, interrupts must be enabled. */ void folio_putback_lru(struct folio *folio) { folio_add_lru(folio); folio_put(folio); /* drop ref from isolate */ } enum folio_references { FOLIOREF_RECLAIM, FOLIOREF_RECLAIM_CLEAN, FOLIOREF_KEEP, FOLIOREF_ACTIVATE, }; #ifdef CONFIG_LRU_GEN /* * Only used on a mapped folio in the eviction (rmap walk) path, where promotion * needs to be done by taking the folio off the LRU list and then adding it back * with PG_active set. In contrast, the aging (page table walk) path uses * folio_update_gen(). */ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else static bool lru_gen_set_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; /* * There are two cases to consider. * 1) Rmap lock contention: rotate. * 2) Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. */ if (referenced_ptes == -1) return FOLIOREF_KEEP; if (lru_gen_enabled()) { if (!referenced_ptes) return FOLIOREF_RECLAIM; return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; } referenced_folio = folio_test_clear_referenced(folio); if (referenced_ptes) { /* * All mapped folios start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated folios as well * so that recently deactivated but used folios are * quickly recovered. */ folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return FOLIOREF_ACTIVATE; return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) return FOLIOREF_RECLAIM_CLEAN; return FOLIOREF_RECLAIM; } /* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ if (!folio_is_file_lru(folio) || (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { *dirty = false; *writeback = false; return; } /* By default assume that the folio flags are accurate */ *dirty = folio_test_dirty(folio); *writeback = folio_test_writeback(folio); /* Verify dirty/writeback state if the filesystem supports it */ if (!folio_test_private(folio)) return; mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } static struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; struct migration_target_control *mtc; mtc = (struct migration_target_control *)private; allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are * low on free memory on target node. If we don't do this and if * we have free memory on the slower(lower) memtier, we would start * allocating pages from slower(lower) memory tiers without even forcing * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers. */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; dst = alloc_migration_target(src, (unsigned long)mtc); if (dst) return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; return alloc_migration_target(src, (unsigned long)mtc); } /* * Take folios on @demote_folios and attempt to demote them to another node. * Folios which are not demoted are left on @demote_folios. */ static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; nodemask_t allowed_mask; struct migration_target_control mtc = { /* * Allocate from 'node', or fail quickly and quietly. * When this happens, 'page' will likely just be discarded * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, .nmask = &allowed_mask, .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) return 0; node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); return nr_succeeded; } static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { if (gfp_mask & __GFP_FS) return true; if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) return false; /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); } /* * shrink_folio_list() returns the number of reclaimed pages */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references, struct mem_cgroup *memcg) { struct folio_batch free_folios; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0, nr_demoted = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) goto keep; if (folio_contain_hwpoisoned_page(folio)) { /* * unmap_poisoned_folio() can't handle large * folio, just skip it. memory_failure() will * handle it if the UCE is triggered again. */ if (folio_test_large(folio)) goto keep_locked; unmap_poisoned_folio(folio, folio_pfn(folio), false); folio_unlock(folio); folio_put(folio); continue; } VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); /* Account the number of base pages */ sc->nr_scanned += nr_pages; if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) stat->nr_dirty += nr_pages; if (dirty && !writeback) stat->nr_unqueued_dirty += nr_pages; /* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time. */ if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs), or the folio belongs to a mapping where * waiting on writeback during reclaim may lead to a deadlock. * In this case mark the folio for immediate reclaim and * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. */ if (folio_test_writeback(folio)) { mapping = folio_mapping(folio); /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || !may_enter_fs(folio, sc->gfp_mask) || (mapping && mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ list_add_tail(&folio->lru, folio_list); continue; } } if (!ignore_references) references = folio_check_references(folio, sc); switch (references) { case FOLIOREF_ACTIVATE: goto activate_locked; case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } /* * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO. */ if (data_race(!list_empty(&folio->_deferred_list) && folio_test_partially_mapped(folio)) && split_folio_to_list(folio, folio_list)) goto activate_locked; } if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages >= HPAGE_PMD_NR) { count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (folio_alloc_swap(folio)) goto activate_locked_split; } /* * Normally the folio will be dirtied in unmap because its * pte should be dirty. A special case is MADV_FREE page. The * page's pte could have dirty bit cleared but the folio's * SwapBacked flag is still set because clearing the dirty bit * and SwapBacked flag has no lock protected. For such folio, * unmap will not set dirty bit for it, so folio reclaim will * not write the folio out. This can cause data corruption when * the folio is swapped in later. Always setting the dirty flag * for the folio solves the problem. */ folio_mark_dirty(folio); } } /* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then. */ if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio); if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; /* * Without TTU_SYNC, try_to_unmap will only begin to * hold PTL from the first present PTE within a large * folio. Some initial PTEs might be skipped due to * races with parallel PTE writes in which PTEs can be * cleared temporarily before being written new present * values. This will lead to a large folio is still * mapped while some subpages have been partially * unmapped after try_to_unmap; TTU_SYNC helps * try_to_unmap acquire PTL from the first PTE, * eliminating the influence of temporary PTE values. */ if (folio_test_large(folio)) flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } /* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem. */ if (folio_maybe_dma_pinned(folio)) goto activate_locked; mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); if (!folio_test_reclaim(folio)) folio_set_reclaim(folio); goto activate_locked; } if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: /* * If shmem folio is split when writeback to swap, * the tail pages will make their own pass through * this function and be accounted then. */ if (nr_pages > 1 && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } goto activate_locked; case PAGE_SUCCESS: if (nr_pages > 1 && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio. */ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) || folio_test_writeback(folio)) goto keep_locked; mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: ; /* try to free the folio below */ } } /* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. * filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed += nr_pages; continue; } } } if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here. */ count_vm_events(PGLAZYFREED, nr_pages); count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; folio_unlock(folio); free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it. */ nr_reclaimed += nr_pages; folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); } continue; activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages. */ if (nr_pages > 1) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); folio_set_active(folio); stat->nr_activate[type] += nr_pages; count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: folio_unlock(folio); keep: list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ nr_demoted = demote_folio_list(&demote_folios, pgdat); nr_reclaimed += nr_demoted; stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); /* * goto retry to reclaim the undemoted folios in folio_list if * desired. * * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure. */ if (!sc->proactive) { do_demote_pass = false; goto retry; } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) swap_write_unplug(plug); return nr_reclaimed; } unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, }; struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio, *next; LIST_HEAD(clean_folios); unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { /* TODO: these pages should not even appear in this list. */ if (page_has_movable_ops(&folio->page)) continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } } /* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists. */ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) { int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0, total_scan = 0, scan = 0; unsigned long nr_pages; unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; folio = lru_to_folio(src); prefetchw_prev_lru_folio(folio, src, flags); nr_pages = folio_nr_pages(folio); total_scan += nr_pages; /* Using max_nr_skipped to prevent hard LOCKUP*/ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && (folio_zonenum(folio) > sc->reclaim_idx)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; max_nr_skipped++; goto move; } /* * Do not count skipped folios because that makes the function * return with no isolated folios if the LRU mostly contains * ineligible folios. This causes the VM to not reclaim any * folios, triggering a premature OOM. * Account all pages in a folio. */ scan += nr_pages; if (!folio_test_lru(folio)) goto move; if (!sc->may_unmap && folio_mapped(folio)) goto move; /* * Be careful not to clear the lru flag until after we're * sure the folio is not being freed elsewhere -- the * folio release code relies on it. */ if (unlikely(!folio_try_get(folio))) goto move; if (!folio_test_clear_lru(folio)) { /* Another thread is already isolating this folio */ folio_put(folio); goto move; } nr_taken += nr_pages; nr_zone_taken[folio_zonenum(folio)] += nr_pages; move_to = dst; move: list_move(&folio->lru, move_to); } /* * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles. */ if (!list_empty(&folios_skipped)) { int zid; list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. * * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. * false if the folio was not on an LRU list. */ bool folio_isolate_lru(struct folio *folio) { bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); ret = true; } return ret; } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) return false; if (!writeback_throttling_sane(sc)) return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { inactive = node_page_state(pgdat, NR_INACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; /* Wake up tasks throttled due to too_many_isolated. */ if (!too_many) wake_throttle_isolated(pgdat); return too_many; } /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * * Returns the number of pages moved to the given lruvec. */ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { spin_unlock_irq(&lruvec->lru_lock); folio_putback_lru(folio); spin_lock_irq(&lruvec->lru_lock); continue; } /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock * folio_set_lru() * list_add(&folio->lru,) * list_add(&folio->lru,) */ folio_set_lru(folio); if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); } continue; } /* * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). */ VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); } if (free_folios.nr) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); } return nr_moved; } /* * If a kernel thread (such as nfsd for loop-back mounts) services a backing * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE); } /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0; /* wait a bit for the reclaimer. */ stalled = true; reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX; } lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, lruvec_memcg(lruvec)); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ if (stat.nr_unqueued_dirty == nr_taken) { wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system. */ if (!writeback_throttling_sane(sc)) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; if (file) sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } /* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { struct folio *folio; cond_resched(); folio = lru_to_folio(&l_hold); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { if (folio_needs_release(folio) && folio_trylock(folio)) { filemap_release_folio(folio, 0); folio_unlock(folio); } } /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags) != 0) { /* * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { nr_rotated += folio_nr_pages(folio); list_add(&folio->lru, &l_active); continue; } } folio_clear_active(folio); /* we are de-activating */ folio_set_workingset(folio); list_add(&folio->lru, &l_inactive); } /* * Move folios back to the lru list. */ spin_lock_irq(&lruvec->lru_lock); nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .no_demotion = 1, }; nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); return nr_reclaimed; } unsigned long reclaim_pages(struct list_head *folio_list) { int nid; unsigned int nr_reclaimed = 0; LIST_HEAD(node_folio_list); unsigned int noreclaim_flag; if (list_empty(folio_list)) return nr_reclaimed; noreclaim_flag = memalloc_noreclaim_save(); nid = folio_nid(lru_to_folio(folio_list)); do { struct folio *folio = lru_to_folio(folio_list); if (nid == folio_nid(folio)) { folio_clear_active(folio); list_move(&folio->lru, &node_folio_list); continue; } nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); else sc->skipped_deactivate = 1; return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } /* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB */ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; unsigned long gb; inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); else inactive_ratio = 1; return inactive * inactive_ratio < active; } enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE, }; static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; struct lruvec *target_lruvec; if (lru_gen_enabled()) return; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* * Flush the memory cgroup stats in rate-limited way as we don't need * most accurate stats here. We may switch to regular stats flushing * in the future once it is cheap enough. */ mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. */ spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. */ if (!sc->force_deactivate) { unsigned long refaults; /* * When refaults are being observed, it means a new * workingset is being established. Deactivate to get * rid of any stale active pages quickly. */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else sc->may_deactivate &= ~DEACTIVATE_ANON; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else sc->may_deactivate &= ~DEACTIVATE_FILE; } else sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; /* * If we have plenty of inactive file pages that aren't * thrashing, try to reclaim those first before touching * anonymous pages. */ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && !sc->no_cache_trim_mode) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip * the scan balance towards the file LRU. And as the file LRU * shrinks, so does the window for rotation from references. * This means we have a runaway feedback loop where a tiny * thrashing file LRU becomes infinitely more attractive than * anon pages. Try to detect this based on file LRU size. */ if (!cgroup_reclaim(sc)) { unsigned long total_high_wmark = 0; unsigned long free, anon; int z; struct zone *zone; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { total_high_wmark += high_wmark_pages(zone); } /* * Consider anon: if that's low too, this isn't a * runaway file reclaim problem, but rather just * extreme pressure. Reclaim as per usual then. */ anon = node_page_state(pgdat, NR_INACTIVE_ANON); sc->file_is_tiny = file + free <= total_high_wmark && !(sc->may_deactivate & DEACTIVATE_ANON) && anon >> sc->priority; } } static inline void calculate_pressure_balance(struct scan_control *sc, int swappiness, u64 *fraction, u64 *denominator) { unsigned long anon_cost, file_cost, total_cost; unsigned long ap, fp; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[WORKINGSET_ANON] = ap; fraction[WORKINGSET_FILE] = fp; *denominator = ap + fp; } static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, struct scan_control *sc, unsigned long scan) { unsigned long min, low; mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min * setting. * * This is important, as otherwise scanning aggression * becomes extremely binary -- from nothing as we * approach the memory protection threshold, to totally * nominal as we exceed it. This results in requiring * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. * * If there is any protection in place, we reduce scan * pressure by how much of the total memory used is * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low * protection. If that fails to reclaim enough pages to * satisfy the reclaim goal, we come back and override * the best-effort low protection. However, we still * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much * memory they are using, reducing the scan pressure * again by how much of the total memory used is under * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ if (!sc->memcg_low_reclaim && low > min) { protection = low; sc->memcg_low_skipped = 1; } else { protection = min; } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); scan -= scan * protection / (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); } return scan; } /* * Determine how aggressively the anon and file LRU lists should be * scanned. * * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; enum lru_list lru; /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } /* * Global reclaim will swap to prevent OOM even with no * swappiness, but memcg users want to use this knob to * disable swapping for individual groups completely when * using the memory controller's swap limit feature would be * too expensive. */ if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } /* Proactive reclaim initiated by userspace for anonymous memory only */ if (swappiness == SWAPPINESS_ANON_ONLY) { WARN_ON_ONCE(!sc->proactive); scan_balance = SCAN_ANON; goto out; } /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } /* * If the system is almost out of file pages, force-scan anon. */ if (sc->file_is_tiny) { scan_balance = SCAN_ANON; goto out; } /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now to make sure * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } scan_balance = SCAN_FRACT; calculate_pressure_balance(sc, swappiness, fraction, &denominator); out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: /* Scan lists relative to size */ break; case SCAN_FRACT: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. * Make sure we don't miss the last page on * the offlined memory cgroups because of a * round-off error. */ scan = mem_cgroup_online(memcg) ? div64_u64(scan * fraction[file], denominator) : DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) scan = 0; break; default: /* Look ma, no brain */ BUG(); } nr[lru] = scan; } } /* * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ if (total_swap_pages > 0) return true; /* Also valuable if anon pages can be demoted: */ return can_demote(lruvec_pgdat(lruvec)->node_id, sc, lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) #else DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) #endif static bool should_walk_mmu(void) { return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); } static bool should_clear_pmd_young(void) { return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); } /****************************************************************************** * shorthand helpers ******************************************************************************/ #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) #define DEFINE_MIN_SEQ(lruvec) \ unsigned long min_seq[ANON_AND_FILE] = { \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } /* Get the min/max evictable type based on swappiness */ #define min_type(swappiness) (!(swappiness)) #define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) #define evictable_min_seq(min_seq, swappiness) \ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) #define for_each_evictable_type(type, swappiness) \ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); #ifdef CONFIG_MEMCG if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; return lruvec; } #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); return &pgdat->__lruvec; } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); if (!sc->may_swap) return 0; if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) { return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; } static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { int type; for (type = 0; type < ANON_AND_FILE; type++) { int n = get_nr_gens(lruvec, type); if (n < MIN_NR_GENS || n > MAX_NR_GENS) return false; } return true; } /****************************************************************************** * Bloom filters ******************************************************************************/ /* * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of * bits in a bitmap, k is the number of hash functions and n is the number of * inserted items. * * Page table walkers use one of the two filters to reduce their search space. * To get rid of non-leaf entries that no longer have enough leaf entries, the * aging uses the double-buffering technique to flip to the other filter each * time it produces a new generation. For non-leaf entries that have enough * leaf entries, the aging carries them over to the next generation in * walk_pmd_range(); the eviction also report them when walking the rmap * in lru_gen_look_around(). * * For future optimizations: * 1. It's not necessary to keep both filters all the time. The spare one can be * freed after the RCU grace period and reallocated if needed again. * 2. And when reallocating, it's worth scaling its size according to the number * of inserted entries in the other filter, to reduce the memory overhead on * small systems and false positives on large systems. * 3. Jenkins' hash function is an alternative to Knuth's. */ #define BLOOM_FILTER_SHIFT 15 static inline int filter_gen_from_seq(unsigned long seq) { return seq % NR_BLOOM_FILTERS; } static void get_item_key(void *item, int *key) { u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); key[1] = hash >> BLOOM_FILTER_SHIFT; } static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; get_item_key(item, key); return test_bit(key[0], filter) && test_bit(key[1], filter); } static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; get_item_key(item, key); if (!test_bit(key[0], filter)) set_bit(key[0], filter); if (!test_bit(key[1], filter)) set_bit(key[1], filter); } static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; } filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * mm_struct list ******************************************************************************/ #ifdef CONFIG_LRU_GEN_WALKS_MMU static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { .fifo = LIST_HEAD_INIT(mm_list.fifo), .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), }; #ifdef CONFIG_MEMCG if (memcg) return &memcg->mm_list; #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); return &mm_list; } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return &lruvec->mm_state; } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { int key; struct mm_struct *mm; struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) return NULL; clear_bit(key, &mm->lru_gen.bitmap); return mmget_not_zero(mm) ? mm : NULL; } void lru_gen_add_mm(struct mm_struct *mm) { int nid; struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); #ifdef CONFIG_MEMCG VM_WARN_ON_ONCE(mm->lru_gen.memcg); mm->lru_gen.memcg = memcg; #endif spin_lock(&mm_list->lock); for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ if (mm_state->tail == &mm_list->fifo) mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); spin_unlock(&mm_list->lock); } void lru_gen_del_mm(struct mm_struct *mm) { int nid; struct lru_gen_mm_list *mm_list; struct mem_cgroup *memcg = NULL; if (list_empty(&mm->lru_gen.list)) return; #ifdef CONFIG_MEMCG memcg = mm->lru_gen.memcg; #endif mm_list = get_mm_list(memcg); spin_lock(&mm_list->lock); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ if (mm_state->head == &mm->lru_gen.list) mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ if (mm_state->tail == &mm->lru_gen.list) mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); spin_unlock(&mm_list->lock); #ifdef CONFIG_MEMCG mem_cgroup_put(mm->lru_gen.memcg); mm->lru_gen.memcg = NULL; #endif } #ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm) { struct mem_cgroup *memcg; struct task_struct *task = rcu_dereference_protected(mm->owner, true); VM_WARN_ON_ONCE(task->mm != mm); lockdep_assert_held(&task->alloc_lock); /* for mm_update_next_owner() */ if (mem_cgroup_disabled()) return; /* migration can happen before addition */ if (!mm->lru_gen.memcg) return; rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); lru_gen_add_mm(mm); } #endif #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { return NULL; } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return NULL; } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { return NULL; } #endif static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); hist = lru_hist_from_seq(walk->seq); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There * are three interesting cases for this page table walker: * 1. It tries to start a new iteration with a stale max_seq: there is * nothing left to do. * 2. It started the next iteration: it needs to reset the Bloom filter * so that a fresh set of PTE tables can be recorded. * 3. It ended the current iteration: it needs to reset the mm stats * counters and tell its caller to increment max_seq. */ spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) mm_state->head = &mm_list->fifo; if (mm_state->head == &mm_list->fifo) first = true; do { mm_state->head = mm_state->head->next; if (mm_state->head == &mm_list->fifo) { WRITE_ONCE(mm_state->seq, mm_state->seq + 1); last = true; break; } /* force scan for those added after the last iteration */ if (!mm_state->tail || mm_state->tail == mm_state->head) { mm_state->tail = mm_state->head->next; walk->force_scan = true; } } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); if (mm && first) reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); *iter = mm; return last; } static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); success = true; } spin_unlock(&mm_list->lock); return success; } /****************************************************************************** * PID controller ******************************************************************************/ /* * A feedback loop based on Proportional-Integral-Derivative (PID) controller. * * The P term is refaulted/(evicted+protected) from a tier in the generation * currently being evicted; the I term is the exponential moving average of the * P term over the generations previously evicted, using the smoothing factor * 1/2; the D term isn't supported. * * The setpoint (SP) is always the first tier of one type; the process variable * (PV) is either any tier of the other type or any other tier of the same * type. * * The error is the difference between the SP and the PV; the correction is to * turn off protection when SP>PV or turn on protection when SP<PV. * * For future optimizations: * 1. The D term may discount the other two terms over time so that long-lived * generations can resist stale information. */ struct ctrl_pos { unsigned long refaulted; unsigned long total; int gain; }; static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->gain = gain; pos->refaulted = pos->total = 0; for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { pos->refaulted += lrugen->avg_refaulted[type][i] + atomic_long_read(&lrugen->refaulted[hist][type][i]); pos->total += lrugen->avg_total[type][i] + lrugen->protected[hist][type][i] + atomic_long_read(&lrugen->evicted[hist][type][i]); } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; lockdep_assert_held(&lruvec->lru_lock); if (!carryover && !clear) return; hist = lru_hist_from_seq(seq); for (tier = 0; tier < MAX_NR_TIERS; tier++) { if (carryover) { unsigned long sum; sum = lrugen->avg_refaulted[type][tier] + atomic_long_read(&lrugen->refaulted[hist][type][tier]); WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) { /* * Return true if the PV has a limited number of refaults or a lower * refaulted/total than the SP. */ return pv->refaulted < MIN_LRU_BATCH || pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= (sp->refaulted + 1) * pv->total * pv->gain; } /****************************************************************************** * the aging ******************************************************************************/ /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) return -1; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } /* protect pages accessed multiple times through file descriptors */ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); do { new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; /* folio_update_gen() has promoted this page? */ if (new_gen >= 0 && new_gen != old_gen) return new_gen; new_gen = (old_gen + 1) % MAX_NR_GENS; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); return new_gen; } static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); walk->batched++; walk->nr_pages[old_gen][type][zone] -= delta; walk->nr_pages[new_gen][type][zone] += delta; } static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; for_each_gen_type_zone(gen, type, zone) { enum lru_list lru = type * LRU_INACTIVE_FILE; int delta = walk->nr_pages[gen][type][zone]; if (!delta) continue; walk->nr_pages[gen][type][zone] = 0; WRITE_ONCE(lrugen->nr_pages[gen][type][zone], lrugen->nr_pages[gen][type][zone] + delta); if (lru_gen_is_active(lruvec, gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); } } static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) { struct address_space *mapping; struct vm_area_struct *vma = args->vma; struct lru_gen_mm_walk *walk = args->private; if (!vma_is_accessible(vma)) return true; if (is_vm_hugetlb_page(vma)) return true; if (!vma_has_recency(vma)) return true; if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) return true; if (vma_is_anonymous(vma)) return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; mapping = vma->vm_file->f_mapping; if (mapping_unevictable(mapping)) return true; if (shmem_mapping(mapping)) return !walk->swappiness; if (walk->swappiness > MAX_SWAPPINESS) return true; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; } /* * Some userspace memory allocators map many single-page VMAs. Instead of * returning back to the PGD table for each of such VMAs, finish an entire PMD * table to reduce zigzags and improve cache performance. */ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, unsigned long *vm_start, unsigned long *vm_end) { unsigned long start = round_up(*vm_end, size); unsigned long end = (start | ~mask) + 1; VMA_ITERATOR(vmi, args->mm, start); VM_WARN_ON_ONCE(mask & size); VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); for_each_vma(vmi, args->vma) { if (end && end <= args->vma->vm_start) return false; if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) continue; *vm_start = max(start, args->vma->vm_start); *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; return true; } return false; } static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; if (WARN_ON_ONCE(pte_special(pte))) return -1; if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) return -1; return pfn; } static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) return -1; return pfn; } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct folio *folio = pfn_folio(pfn); if (folio_lru_gen(folio) < 0) return NULL; if (folio_nid(folio) != pgdat->node_id) return NULL; if (folio_memcg(folio) != memcg) return NULL; return folio; } static bool suitable_to_scan(int total, int young) { int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); /* suitable if the average number of young PTEs per cacheline is >=1 */ return young * n >= total; } static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, int new_gen, bool dirty) { int old_gen; if (!folio) return; if (dirty && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); if (walk) { old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); } else if (lru_gen_set_refs(folio)) { old_gen = folio_lru_gen(folio); if (old_gen >= 0 && old_gen != new_gen) folio_activate(folio); } } static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; if (!spin_trylock(ptl)) { pte_unmap(pte); return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { pte_unmap_unlock(pte, ptl); return false; } arch_enter_lazy_mmu_mode(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pte_dirty(ptent)) dirty = true; young++; walk->mm_stats[MM_LEAF_YOUNG]++; } walk_update_folio(walk, last, gen, dirty); last = NULL; if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); } static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; bool dirty; pmd_t *pmd; spinlock_t *ptl; struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ if (*first == -1) { *first = addr; bitmap_zero(bitmap, MIN_LRU_BATCH); return; } i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) goto done; arch_enter_lazy_mmu_mode(); do { unsigned long pfn; struct folio *folio; /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { if (!walk->force_scan && should_clear_pmd_young() && !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); if (pfn == -1) goto next; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pmd_dirty(pmd[i])) dirty = true; walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); walk_update_folio(walk, last, gen, dirty); arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: *first = -1; } static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pmd_t *pmd; unsigned long next; unsigned long addr; struct vm_area_struct *vma; DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* * Finish an entire PMD in two passes: the first only reaches to PTE * tables to avoid taking the PMD lock; the second, if necessary, takes * the PMD lock to clear the accessed bit in PMD entries. */ pmd = pmd_offset(pud, start & PUD_MASK); restart: /* walk_pte_range() may call get_next_vma() */ vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { pmd_t val = pmdp_get_lockless(pmd + i); next = pmd_addr_end(addr, end); if (!pmd_present(val) || is_huge_zero_pmd(val)) { walk->mm_stats[MM_LEAF_TOTAL]++; continue; } if (pmd_trans_huge(val)) { struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; if (pfn != -1) walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } if (!walk->force_scan && should_clear_pmd_young() && !mm_has_notifiers(args->mm)) { if (!pmd_young(val)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; if (!walk_pte_range(&val, addr, next, args)) continue; walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; } static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pud_t *pud; unsigned long addr; unsigned long next; struct lru_gen_mm_walk *walk = args->private; VM_WARN_ON_ONCE(p4d_leaf(*p4d)); pud = pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { pud_t val = pudp_get(pud + i); next = pud_addr_end(addr, end); if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) continue; walk_pmd_range(&val, addr, next, args); if (need_resched() || walk->batched >= MAX_LRU_BATCH) { end = (addr | ~PUD_MASK) + 1; goto done; } } if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) goto restart; end = round_up(end, P4D_SIZE); done: if (!end || !args->vma) return 1; walk->next_addr = max(end, args->vma->vm_start); return -EAGAIN; } static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, .p4d_entry = walk_pud_range, .walk_lock = PGWALK_RDLOCK, }; int err; struct lruvec *lruvec = walk->lruvec; walk->next_addr = FIRST_USER_ADDRESS; do { DEFINE_MAX_SEQ(lruvec); err = -EBUSY; /* another thread might have called inc_max_seq() */ if (walk->seq != max_seq) break; /* the caller might be holding the lock for write */ if (mmap_read_trylock(mm)) { err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); mmap_read_unlock(mm); } if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } cond_resched(); } while (err == -EAGAIN); } static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; if (pgdat && current_is_kswapd()) { VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); } current->reclaim_state->mm_walk = walk; return walk; } static void clear_mm_walk(void) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); current->reclaim_state->mm_walk = NULL; if (!current_is_kswapd()) kfree(walk); } static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); /* For file type, skip the check if swappiness is anon only */ if (type && (swappiness == SWAPPINESS_ANON_ONLY)) goto done; /* For anon type, skip the check if swappiness is zero (file only) */ if (!type && !swappiness) goto done; /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); /* don't count the workingset being lazily promoted */ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { int tier = lru_tier_from_refs(refs, workingset); int delta = folio_nr_pages(folio); WRITE_ONCE(lrugen->protected[hist][type][tier], lrugen->protected[hist][type][tier] + delta); } if (!--remaining) return false; } } done: reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); return true; } static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } min_seq[type]++; seq_inc_flag = true; } next: ; } /* * If min_seq[type] of both anonymous and file is not increased, * we can directly return false to avoid unnecessary checking * overhead later. */ if (!seq_inc_flag) return success; /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsigned long seq = lrugen->max_seq - MIN_NR_GENS; if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) min_seq[LRU_GEN_ANON] = seq; else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) min_seq[LRU_GEN_FILE] = seq; } for_each_evictable_type(type, swappiness) { if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); success = true; } return success; } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); success = seq == lrugen->max_seq; if (!success) goto unlock; for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); cond_resched(); goto restart; } /* * Update the active/inactive LRU sizes for compatibility. Both sides of * the current max_seq need to be covered, since max_seq+1 can overlap * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do * overlap, cold/hot inversion happens. */ prev = lru_gen_from_seq(lrugen->max_seq - 1); next = lru_gen_from_seq(lrugen->max_seq + 1); for (type = 0; type < ANON_AND_FILE; type++) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { enum lru_list lru = type * LRU_INACTIVE_FILE; long delta = lrugen->nr_pages[prev][type][zone] - lrugen->nr_pages[next][type][zone]; if (!delta) continue; __update_lru_size(lruvec, lru, zone, delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); } } for (type = 0; type < ANON_AND_FILE; type++) reset_ctrl_pos(lruvec, type, false); WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: spin_unlock_irq(&lruvec->lru_lock); return success; } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) return false; /* * If the hardware doesn't automatically set the accessed bit, fallback * to lru_gen_look_around(), which only clears the accessed bit in a * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; walk->seq = seq; walk->swappiness = swappiness; walk->force_scan = force_scan; do { success = iterate_mm_list(walk, &mm); if (mm) walk_mm(mm, walk); } while (mm); done: if (success) { success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } return success; } /****************************************************************************** * working set protection ******************************************************************************/ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) { int priority; unsigned long reclaimable; if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* * Determine the initial priority based on * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); /* * The estimation is based on LRU pages only, so cap it to prevent * overshoots of shrinker objects by large margins. */ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); } static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } /* whether the size is big enough to be helpful */ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; } static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) { int gen; unsigned long birth; int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); if (mem_cgroup_below_min(NULL, memcg)) return false; if (!lruvec_is_sizable(lruvec, sc)) return false; gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); } /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); bool reclaimable = !min_ttl; VM_WARN_ON_ONCE(!current_is_kswapd()); set_initial_priority(pgdat, sc); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); mem_cgroup_calculate_protection(NULL, memcg); if (!reclaimable) reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min. */ if (!reclaimable && mutex_trylock(&oom_lock)) { struct oom_control oc = { .gfp_mask = sc->gfp_mask, }; out_of_memory(&oc); mutex_unlock(&oom_lock); } } /****************************************************************************** * rmap/PT walk feedback ******************************************************************************/ /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); if (!ptep_clear_young_notify(vma, addr, pte)) return false; if (spin_is_contended(pvmw->ptl)) return true; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; if (end - start == PAGE_SIZE) return true; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } arch_enter_lazy_mmu_mode(); pte -= (addr - start) / PAGE_SIZE; for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; pte_t ptent = ptep_get(pte + i); pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; if (last != folio) { walk_update_folio(walk, last, gen, dirty); last = folio; dirty = false; } if (pte_dirty(ptent)) dirty = true; young++; } walk_update_folio(walk, last, gen, dirty); arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); return true; } /****************************************************************************** * memcg LRU ******************************************************************************/ /* see the comment on MEMCG_NR_GENS */ enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL, MEMCG_LRU_OLD, MEMCG_LRU_YOUNG, }; static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; int old, new; unsigned long flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); seg = 0; new = old = lruvec->lrugen.gen; /* see the comment on MEMCG_NR_GENS */ if (op == MEMCG_LRU_HEAD) seg = MEMCG_LRU_HEAD; else if (op == MEMCG_LRU_TAIL) seg = MEMCG_LRU_TAIL; else if (op == MEMCG_LRU_OLD) new = get_memcg_gen(pgdat->memcg_lru.seq); else if (op == MEMCG_LRU_YOUNG) new = get_memcg_gen(pgdat->memcg_lru.seq + 1); else VM_WARN_ON_ONCE(true); WRITE_ONCE(lruvec->lrugen.seg, seg); WRITE_ONCE(lruvec->lrugen.gen, new); hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); else hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } #ifdef CONFIG_MEMCG void lru_gen_online_memcg(struct mem_cgroup *memcg) { int gen; int nid; int bin = get_random_u32_below(MEMCG_NR_BINS); for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); gen = get_memcg_gen(pgdat->memcg_lru.seq); lruvec->lrugen.gen = gen; hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_offline_memcg(struct mem_cgroup *memcg) { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); } } void lru_gen_release_memcg(struct mem_cgroup *memcg) { int gen; int nid; for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); if (hlist_nulls_unhashed(&lruvec->lrugen.list)) goto unlock; gen = lruvec->lrugen.gen; hlist_nulls_del_init_rcu(&lruvec->lrugen.list); pgdat->memcg_lru.nr_memcgs[gen]--; if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); unlock: spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } #endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction ******************************************************************************/ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, int tier_idx) { bool success; bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); /* unevictable */ if (!folio_evictable(folio)) { success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); folio_set_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, delta); return true; } /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } /* protected */ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); /* don't count the workingset being lazily promoted */ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { int hist = lru_hist_from_seq(lrugen->min_seq[type]); WRITE_ONCE(lrugen->protected[hist][type][tier], lrugen->protected[hist][type][tier] + delta); } return true; } /* ineligible */ if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } dirty = folio_test_dirty(folio); writeback = folio_test_writeback(folio); if (type == LRU_GEN_FILE && dirty) { sc->nr.file_taken += delta; if (!writeback) sc->nr.unqueued_dirty += delta; } /* waiting for writeback */ if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } return false; } static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) { bool success; /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; /* raced with release_pages() */ if (!folio_try_get(folio)) return false; /* raced with another isolation */ if (!folio_test_clear_lru(folio)) { folio_put(folio); return false; } /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); return true; } static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { int i; int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; int isolated = 0; int skipped = 0; int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); int remaining = scan_batch; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); if (get_nr_gens(lruvec, type) == MIN_NR_GENS) return 0; gen = lru_gen_from_seq(lrugen->min_seq[type]); for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); scanned += delta; if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); isolated += delta; } else { list_move(&folio->lru, &moved); skipped_zone += delta; } if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } if (skipped_zone) { list_splice(&moved, head); __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) break; } item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } count_memcg_events(memcg, item, isolated); count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); if (type == LRU_GEN_FILE) sc->nr.file_taken += isolated; /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? scanned : 0; } static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; struct ctrl_pos sp, pv; /* * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } return tier - 1; } static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { struct ctrl_pos sp, pv; if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE; if (swappiness >= MAX_SWAPPINESS) return LRU_GEN_ANON; /* * Compare the sum of all tiers of anon with that of file to determine * which type to scan. */ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); return positive_ctrl_err(&sp, &pv); } static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; int type = get_type_to_scan(lruvec, swappiness); for_each_evictable_type(i, swappiness) { int scanned; int tier = get_tier_idx(lruvec, type); *type_scanned = type; scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) return scanned; type = !type; } return 0; } static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); struct folio *folio; struct folio *next; enum vm_event_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irq(&lruvec->lru_lock); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); if (list_empty(&list)) return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { DEFINE_MIN_SEQ(lruvec); if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } /* retry folios that may have missed folio_rotate_reclaimable() */ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && !folio_test_dirty(folio) && !folio_test_writeback(folio)) { list_move(&folio->lru, &clean); continue; } /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; reset_batch_size(walk); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); list_splice_init(&clean, &list); if (!list_empty(&list)) { skip_retry = true; goto retry; } return scanned; } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); *nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } *nr_to_scan = size; /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { int i; enum zone_watermarks mark; /* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) return false; if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; /* check the order to exclude compaction-induced reclaim */ if (!current_is_kswapd() || sc->order) return false; mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? WMARK_PROMO : WMARK_HIGH; for (i = 0; i <= sc->reclaim_idx; i++) { struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) return false; } /* kswapd should abort if all eligible zones are safe */ return true; } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); while (true) { int delta; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (nr_to_scan <= 0) break; delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; scanned += delta; if (scanned >= nr_to_scan) break; if (should_abort_scan(lruvec, sc)) break; cond_resched(); } /* * If too many file cache in the coldest generation can't be evicted * due to being dirty, wake up the flusher. */ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); /* whether this lruvec should be rotated */ return nr_to_scan < 0; } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { bool success; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); } success = try_to_shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); if (success && mem_cgroup_online(memcg)) return MEMCG_LRU_YOUNG; if (!success && lruvec_is_sizable(lruvec, sc)) return 0; /* one retry if offlined or too small */ return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; } static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { int op; int gen; int bin; int first_bin; struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct hlist_nulls_node *pos; gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; rcu_read_lock(); hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { if (op) { lru_gen_rotate_memcg(lruvec, op); op = 0; } mem_cgroup_put(memcg); memcg = NULL; if (gen != READ_ONCE(lrugen->gen)) continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); if (!mem_cgroup_tryget(memcg)) { lru_gen_release_memcg(memcg); memcg = NULL; continue; } rcu_read_unlock(); op = shrink_one(lruvec, sc); rcu_read_lock(); if (should_abort_scan(lruvec, sc)) break; } rcu_read_unlock(); if (op) lru_gen_rotate_memcg(lruvec, op); mem_cgroup_put(memcg); if (!is_a_nulls(pos)) return; /* restart if raced with lru_gen_rotate_memcg() */ if (gen != get_nulls_value(pos)) goto restart; /* try the rest of the bins of the current generation */ bin = get_memcg_bin(bin + 1); if (bin != first_bin) goto restart; } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; VM_WARN_ON_ONCE(root_reclaim(sc)); VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); set_mm_walk(NULL, sc->proactive); if (try_to_shrink_lruvec(lruvec, sc)) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); clear_mm_walk(); blk_finish_plug(&plug); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { struct blk_plug plug; unsigned long reclaimed = sc->nr_reclaimed; VM_WARN_ON_ONCE(!root_reclaim(sc)); /* * Unmapped clean folios are already prioritized. Scanning for more of * them is likely futile and can cause high reclaim latency when there * is a large number of memcgs. */ if (!sc->may_writepage || !sc->may_unmap) goto done; lru_add_drain(); blk_start_plug(&plug); set_mm_walk(pgdat, sc->proactive); set_initial_priority(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed = 0; if (mem_cgroup_disabled()) shrink_one(&pgdat->__lruvec, sc); else shrink_many(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed += reclaimed; clear_mm_walk(); blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** * state change ******************************************************************************/ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; for_each_evictable_lru(lru) { if (!list_empty(&lruvec->lists[lru])) return false; } } else { int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } return true; } static bool fill_evictable(struct lruvec *lruvec) { enum lru_list lru; int remaining = MAX_LRU_BATCH; for_each_evictable_lru(lru) { int type = is_file_lru(lru); bool active = is_active_lru(lru); struct list_head *head = &lruvec->lists[lru]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); lruvec_del_folio(lruvec, folio); success = lru_gen_add_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); if (!--remaining) return false; } } return true; } static bool drain_evictable(struct lruvec *lruvec) { int gen, type, zone; int remaining = MAX_LRU_BATCH; for_each_gen_type_zone(gen, type, zone) { struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); success = lru_gen_del_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); lruvec_add_folio(lruvec, folio); if (!--remaining) return false; } } return true; } static void lru_gen_change_state(bool enabled) { static DEFINE_MUTEX(state_mutex); struct mem_cgroup *memcg; cgroup_lock(); cpus_read_lock(); get_online_mems(); mutex_lock(&state_mutex); if (enabled == lru_gen_enabled()) goto unlock; if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); lruvec->lrugen.enabled = enabled; while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { spin_unlock_irq(&lruvec->lru_lock); cond_resched(); spin_lock_irq(&lruvec->lru_lock); } spin_unlock_irq(&lruvec->lru_lock); } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); unlock: mutex_unlock(&state_mutex); put_online_mems(); cpus_read_unlock(); cgroup_unlock(); } /****************************************************************************** * sysfs interface ******************************************************************************/ static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned int msecs; if (kstrtouint(buf, 0, &msecs)) return -EINVAL; WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); return len; } static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int caps = 0; if (get_cap(LRU_GEN_CORE)) caps |= BIT(LRU_GEN_CORE); if (should_walk_mmu()) caps |= BIT(LRU_GEN_MM_WALK); if (should_clear_pmd_young()) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); return sysfs_emit(buf, "0x%04x\n", caps); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { int i; unsigned int caps; if (tolower(*buf) == 'n') caps = 0; else if (tolower(*buf) == 'y') caps = -1; else if (kstrtouint(buf, 0, &caps)) return -EINVAL; for (i = 0; i < NR_LRU_GEN_CAPS; i++) { bool enabled = caps & BIT(i); if (i == LRU_GEN_CORE) lru_gen_change_state(enabled); else if (enabled) static_branch_enable(&lru_gen_caps[i]); else static_branch_disable(&lru_gen_caps[i]); } return len; } static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, &lru_gen_enabled_attr.attr, NULL }; static const struct attribute_group lru_gen_attr_group = { .name = "lru_gen", .attrs = lru_gen_attrs, }; /****************************************************************************** * debugfs interface ******************************************************************************/ static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg; loff_t nr_to_skip = *pos; m->private = kvmalloc(PATH_MAX, GFP_KERNEL); if (!m->private) return ERR_PTR(-ENOMEM); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node_state(nid, N_MEMORY) { if (!nr_to_skip--) return get_lruvec(memcg, nid); } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); return NULL; } static void lru_gen_seq_stop(struct seq_file *m, void *v) { if (!IS_ERR_OR_NULL(v)) mem_cgroup_iter_break(NULL, lruvec_memcg(v)); kvfree(m->private); m->private = NULL; } static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) { int nid = lruvec_pgdat(v)->node_id; struct mem_cgroup *memcg = lruvec_memcg(v); ++*pos; nid = next_memory_node(nid); if (nid == MAX_NUMNODES) { memcg = mem_cgroup_iter(NULL, memcg, NULL); if (!memcg) return NULL; nid = first_memory_node; } return get_lruvec(memcg, nid); } static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, unsigned long seq) { int i; int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) seq_printf(m, " %10lu%c", n[i], s[i]); } seq_putc(m, '\n'); } if (!mm_state) return; seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { s = "TYFA"; n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "tyfa"; n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); } seq_putc(m, '\n'); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); if (nid == first_memory_node) { const char *path = memcg ? m->private : ""; #ifdef CONFIG_MEMCG if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); if (!full) seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else seq = 0; for (; seq <= max_seq; seq++) { int type, zone; int gen = lru_gen_from_seq(seq); unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); for (type = 0; type < ANON_AND_FILE; type++) { unsigned long size = 0; char mark = full && seq < min_seq[type] ? 'x' : ' '; for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); seq_printf(m, " %10lu%c", size, mark); } seq_putc(m, '\n'); if (full) lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); } return 0; } static const struct seq_operations lru_gen_seq_ops = { .start = lru_gen_seq_start, .stop = lru_gen_seq_stop, .next = lru_gen_seq_next, .show = lru_gen_seq_show, }; static int run_aging(struct lruvec *lruvec, unsigned long seq, int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); if (seq > max_seq) return -EINVAL; return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { DEFINE_MAX_SEQ(lruvec); if (seq + MIN_NR_GENS > max_seq) return -EINVAL; sc->nr_reclaimed = 0; while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) return 0; if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, swappiness)) return 0; cond_resched(); } return -EINTR; } static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { struct lruvec *lruvec; int err = -EINVAL; struct mem_cgroup *memcg = NULL; if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) return -EINVAL; if (!mem_cgroup_disabled()) { rcu_read_lock(); memcg = mem_cgroup_from_id(memcg_id); if (!mem_cgroup_tryget(memcg)) memcg = NULL; rcu_read_unlock(); if (!memcg) return -EINVAL; } if (memcg_id != mem_cgroup_id(memcg)) goto done; sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { case '+': err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); break; } done: mem_cgroup_put(memcg); return err; } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, size_t len, loff_t *pos) { void *buf; char *cur, *next; unsigned int flags; struct blk_plug plug; int err = -EINVAL; struct scan_control sc = { .may_writepage = true, .may_unmap = true, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, src, len)) { kvfree(buf); return -EFAULT; } set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } next = buf; next[len] = '\0'; while ((cur = strsep(&next, ",;\n"))) { int n; int end; char cmd, swap_string[5]; unsigned int memcg_id; unsigned int nid; unsigned long seq; unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } if (n == 4) { swappiness = -1; } else if (!strcmp("max", swap_string)) { /* set by userspace for anonymous memory only */ swappiness = SWAPPINESS_ANON_ONLY; } else { err = kstrtouint(swap_string, 0, &swappiness); if (err) break; } err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; } done: clear_mm_walk(); blk_finish_plug(&plug); memalloc_noreclaim_restore(flags); set_task_reclaim_state(current, NULL); kvfree(buf); return err ? : len; } static int lru_gen_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &lru_gen_seq_ops); } static const struct file_operations lru_gen_rw_fops = { .open = lru_gen_seq_open, .read = seq_read, .write = lru_gen_seq_write, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations lru_gen_ro_fops = { .open = lru_gen_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; /****************************************************************************** * initialization ******************************************************************************/ void lru_gen_init_pgdat(struct pglist_data *pgdat) { int i, j; spin_lock_init(&pgdat->memcg_lru.lock); for (i = 0; i < MEMCG_NR_GENS; i++) { for (j = 0; j < MEMCG_NR_BINS; j++) INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); } } void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); for (i = 0; i <= MIN_NR_GENS + 1; i++) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); if (mm_state) mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg) { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); if (!mm_list) return; INIT_LIST_HEAD(&mm_list->fifo); spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; if (!mm_state) continue; for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(mm_state->filters[i]); mm_state->filters[i] = NULL; } } } #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false, &lru_gen_rw_fops); debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true, &lru_gen_ro_fops); return 0; }; late_initcall(init_lru_gen); #else /* !CONFIG_LRU_GEN */ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; if (lru_gen_enabled() && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); /* * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned; for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc); } } cond_resched(); if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. */ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; /* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim. */ if (!nr_file || !nr_anon) break; if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; lru = LRU_BASE; percentage = nr_anon * 100 / scan_target; } else { unsigned long scan_target = targets[LRU_INACTIVE_FILE] + targets[LRU_ACTIVE_FILE] + 1; lru = LRU_FILE; percentage = nr_file * 100 / scan_target; } /* Stop scanning the smaller of the LRU */ nr[lru] = 0; nr[lru + LRU_ACTIVE] = 0; /* * Recalculate the other LRU scan count based on its original * scan target and the percentage scanning already complete */ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); lru += LRU_ACTIVE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; return false; } /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; /* * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX * number of pages that were scanned. This will return to the caller * with the risk reclaim/compaction and the resulting allocation attempt * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL * allocations through requiring that the full LRU list has been scanned * first, by assuming that zero delta of sc->nr_scanned means full LRU * scan, but that approximation was wrong, and there were corner cases * where always a non-zero amount of pages were scanned. */ if (!nr_reclaimed) return false; /* If compaction would go ahead or the allocation would succeed, stop */ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { unsigned long watermark = min_wmark_pages(zone); /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, watermark, sc->reclaim_idx, 0)) return false; if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) return false; } /* * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; } static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat, }; struct mem_cgroup_reclaim_cookie *partial = &reclaim; struct mem_cgroup *memcg; /* * In most cases, direct reclaimers can do partial walks * through the cgroup tree, using an iterator state that * persists across invocations. This strikes a balance between * fairness and allocation latency. * * For kswapd, reliable forward progress is more important * than a quick return to idle. Always do full walks. */ if (current_is_kswapd() || sc->memcg_full_walk) partial = NULL; memcg = mem_cgroup_iter(target_memcg, NULL, partial); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; /* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they * don't have any reclaimable pages, or because their * memory is explicitly protected. Avoid soft lockups. */ cond_resched(); mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups. */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); /* Record the group's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); /* If partial walks are allowed, bail once goal is reached */ if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(target_memcg, memcg); break; } } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial))); } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed; struct lruvec *target_lruvec; bool reclaimable = false; if (lru_gen_enabled() && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); return; } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); flush_reclaim_state(sc); nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; /* Record the subtree's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) reclaimable = true; if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so forcibly stall * until some pages complete writeback. */ if (sc->nr.immediate) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* * Tag a node/memcg as congested if all the dirty pages were marked * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); if (current_is_kswapd()) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } /* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again; /* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) atomic_set(&pgdat->kswapd_failures, 0); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } /* * Returns true if compaction should go ahead for a costly-order request, or * the allocation would already succeed without compaction. Return false if we * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; if (!gfp_compaction_allowed(sc->gfp_mask)) return false; /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) return true; /* * Direct reclaim usually targets the min watermark, but compaction * takes time to run and there are potentially other callers using the * pages just freed. So target a higher buffer to give compaction a * reasonable chance of completing and allocating the pages. * * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. */ watermark = high_wmark_pages(zone); if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) return true; return false; } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { /* * If reclaim is making progress greater than 12% efficiency then * wake all the NOPROGRESS throttled tasks. */ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; if (waitqueue_active(wqh)) wake_up(wqh); return; } /* * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages * under writeback and marked for immediate reclaim at the tail of the * LRU. */ if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; sc->reclaim_idx = gfp_zone(sc->gfp_mask); } for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a * noticeable problem, like transparent huge * page allocations. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a * node may be shrunk multiple times but in that case * the user prefers lower zones being preserved. */ if (zone->zone_pgdat == last_pgdat) continue; /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } if (!first_pgdat) first_pgdat = zone->zone_pgdat; /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); } if (first_pgdat) consider_reclaim_throttle(first_pgdat, sc); /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; } static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; if (lru_gen_enabled()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone; retry: delayacct_freepages_start(); if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { if (!sc->proactive) vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; if (sc->compaction_ready) break; /* * If we're getting trouble reclaiming, start doing * writepage even in laptop mode. */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); if (cgroup_reclaim(sc)) { struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ if (sc->compaction_ready) return 1; /* * In most cases, direct reclaimers can do partial walks * through the cgroup tree to meet the reclaim goal while * keeping latency low. Since the iterator state is shared * among all direct reclaim invocations (to retain fairness * among cgroups), though, high concurrency can result in * individual threads not seeing enough cgroups to make * meaningful forward progress. Avoid false OOMs in this case. */ if (!sc->memcg_full_walk) { sc->priority = initial_priority; sc->memcg_full_walk = 1; goto retry; } /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; } return 0; } static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; unsigned long free_pages = 0; int i; bool wmark_ok; if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) continue; pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ if (!pfmemalloc_reserve) return true; wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } return wmark_ok; } /* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer. */ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly * responsible for cleaning pages necessary for reclaim to make forward * progress. kjournald for example may enter direct reclaim while * committing a transaction where throttling it could forcing other * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) goto out; /* * If a fatal signal is pending, this process should not throttle. * It should return quickly so it can exit and free its memory */ if (fatal_signal_pending(current)) goto out; /* * Check if the pfmemalloc reserves are ok by finding the first node * with a usable ZONE_NORMAL or lower zone. The expectation is that * GFP_KERNEL will be required for allocating network buffers when * swapping over the network so ZONE_HIGHMEM is unusable. * * Throttling is based on the first usable node and throttled processes * wait on a queue until kswapd makes progress and wakes them. There * is an affinity then between processes waking up and where reclaim * progress has been made assuming the process wakes on the same node. * More importantly, processes running on remote nodes will not compete * for remote pfmemalloc reserves and processes on different nodes * should make reasonable progress. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue; /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; if (allow_direct_reclaim(pgdat)) goto out; break; } /* If no zone was usable by the allocation flags then do not throttle */ if (!pgdat) goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); /* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal * transaction in the case of a filesystem like ext[3|4]. In this case, * it is not safe to block on pfmemalloc_wait as kswapd could be * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat)); if (fatal_signal_pending(current)) return true; out: return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, }; /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #ifdef CONFIG_MEMCG /* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; WARN_ON_ONCE(!current->reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put * equal pressure on all the nodes. This is based on the assumption that * the reclaim does not bail out early. */ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #else unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness) { return 0; } #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; if (lru_gen_enabled()) { lru_gen_age_node(pgdat, sc); return; } lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!can_age_anon_pages(lruvec, sc)) return; if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { lruvec = mem_cgroup_lruvec(memcg, pgdat); shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; /* * Check for watermark boosts top-down as the higher zones * are more likely to be boosted. Both watermarks and boosts * should not be checked at the same time as reclaim would * start prematurely when there is no boosting and a lower * zone is balanced. */ for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (zone->watermark_boost) return true; } return false; } /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx */ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; struct zone *zone; /* * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { enum zone_stat_item item; unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); /* * In defrag_mode, watermarks must be met in whole * blocks to avoid polluting allocator fallbacks. * * However, kswapd usually cannot accomplish this on * its own and needs kcompactd support. Once it's * reclaimed a compaction gap, and kswapd_shrink_node * has dropped order, simply ensure there are enough * base pages for compaction, wake kcompactd & sleep. */ if (defrag_mode && order) item = NR_FREE_PAGES_BLOCKS; else item = NR_FREE_PAGES; /* * When there is a high number of CPUs in the system, * the cumulative error from the vmstat per-cpu cache * can blur the line between the watermarks. In that * case, be safe and get an accurate snapshot. * * TODO: NR_FREE_PAGES_BLOCKS moves in steps of * pageblock_nr_pages, while the vmstat pcp threshold * is limited to 125. On many configurations that * counter won't actually be per-cpu cached. But keep * things simple for now; revisit when somebody cares. */ free_pages = zone_page_state(zone, item); if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) free_pages = zone_page_state_snapshot(zone, item); if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, 0, free_pages)) return true; } /* * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ if (mark == -1) return true; return false; } /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. * * Returns true if kswapd is ready to sleep */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the * zones, which causes kswapd to exit balance_pgdat() before reaching * the wake up checks. If kswapd is going to sleep, no process should * be sleeping on pfmemalloc_wait, so wake them now if necessary. If * the wake up is premature, processes will wake kswapd and get * throttled again. The difference from wake ups in balance_pgdat() is * that here we are under prepare_to_wait(). */ if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } return false; } /* * kswapd shrinks a node of pages that are at or below the highest usable * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct zone *zone; int z; unsigned long nr_reclaimed = sc->nr_reclaimed; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } /* * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ shrink_node(pgdat, sc); /* * Fragmentation may mean that the system cannot be rebalanced for * high-order allocations. If twice the allocation size has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; /* account for progress from mm_account_reclaimed_pages() */ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. */ static inline void update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) { int i; struct zone *zone; for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); } } static inline void set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, true); } static inline void clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, false); } /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, .may_unmap = 1, }; set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); /* * Account for the reclaim boost. Note that the zone boost is left in * place so that parallel allocations that are near the watermark will * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } boosted = nr_boost_reclaim; restart: set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; bool balanced; bool ret; bool was_frozen; sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed * then consider reclaiming from all zones. This has a dual * purpose -- on 64-bit systems it is expected that * buffer_heads are stripped during active rotation. On 32-bit * systems, highmem pages can pin lowmem memory and shrinking * buffers can relieve lowmem pressure. Reclaim may still not * go ahead if all eligible zones for the original allocation * request are balanced to avoid excessive reclaim from kswapd. */ if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; sc.reclaim_idx = i; break; } } /* * If the pgdat is imbalanced then ignore boosting and preserve * the watermarks for a later time and restart. Note that the * zone watermarks will be still reset at the end of balancing * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; } /* * If boosting is not active then only reclaim if there are no * eligible zones. Note that sc.reclaim_idx is not used as * buffer_heads_over_limit may have adjusted it. */ if (!nr_boost_reclaim && balanced) goto out; /* Limit the priority of boosting to avoid reclaim writeback */ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) raise_priority = false; /* * Do not writeback or swap pages for boosted reclaim. The * intent is to relieve pressure not issue sub-optimal IO * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated * regardless of classzone as this is about consistent aging. */ kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; /* * There should be no need to raise the scanning priority if * enough pages are already being scanned that that high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) raise_priority = false; /* * If the low watermark is met there is no need for processes * to be throttled on pfmemalloc_wait as they should not be * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); if (was_frozen || ret) break; /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); /* * If reclaim made no progress for a boost, stop reclaim as * IO cannot be queued and it could be an infinite loop in * extreme circumstances. */ if (nr_boost_reclaim && !nr_reclaimed) break; if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); /* * Restart only if it went through the priority loop all the way, * but cache_trim_mode didn't work. */ if (!sc.nr_reclaimed && sc.priority < 1 && !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { sc.no_cache_trim_mode = 1; goto restart; } /* * If the reclaim was boosted, we might still be far from the * watermark_high at this point. We need to avoid increasing the * failure count to prevent the kswapd thread from stopping. */ if (!sc.nr_reclaimed && !boosted) atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; /* Increments are under the zone lock */ zone = pgdat->node_zones + i; spin_lock_irqsave(&zone->lock, flags); zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); spin_unlock_irqrestore(&zone->lock, flags); } /* * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller * entered the allocator slow path while kswapd was awake, order will * remain at the higher level. */ return sc.order; } /* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for first time or kswapd couldn't * sleep after previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle. */ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx) { enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); if (freezing(current) || kthread_should_stop()) return; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* * Try to sleep for a short interval. Note that kcompactd will only be * woken if it is possible to sleep for a short interval. This is * deliberate on the assumption that if reclaim cannot keep an * eligible zone balanced that it's also unlikely that compaction will * succeed. */ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. * When kswapd is going to sleep, it is reasonable to assume * that pages and compaction may succeed so reset the cache. */ reset_isolation_suitable(pgdat); /* * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { WRITE_ONCE(pgdat->kswapd_highest_zoneidx, kswapd_highest_zoneidx(pgdat, highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); } finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } /* * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* * vmstat counters are not perfectly accurate and the estimated * value for counters such as NR_FREE_PAGES can deviate from the * true value by nr_online_cpus * threshold. To avoid the zone * watermarks being breached while under pressure, we reduce the * per-cpu vmstat threshold while kswapd is awake and restore * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); if (!kthread_should_stop()) schedule(); set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); } finish_wait(&pgdat->kswapd_wait, &wait); } /* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx); /* Read the new order and highest_zoneidx */ alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ if (was_frozen) continue; /* * Reclaim begins at the requested order but if a high-order * reclaim fails then kswapd falls back to reclaiming for * order-0. If that happens, kswapd will consider sleeping * for the order it finished reclaiming at (reclaim_order) * but kcompactd is woken to compact for the original * request (alloc_order). */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); return 0; } /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; /* Hopeless node, leave it to direct reclaim if possible */ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ /* * This kswapd start function will be called by init and node-hot-add. */ void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d,ret=%ld\n", nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; } else { wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); } /* * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; pgdat_kswapd_lock(pgdat); kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); pgdat->kswapd = NULL; } pgdat_kswapd_unlock(pgdat); } static const struct ctl_table vmscan_sysctl_table[] = { { .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", .data = &node_reclaim_mode, .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } #endif }; static int __init kswapd_init(void) { int nid; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); register_sysctl_init("vm", vmscan_sysctl_table); return 0; } module_init(kswapd_init) #ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ int node_reclaim_mode __read_mostly; /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ #define NODE_RECLAIM_PRIORITY 4 /* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; /* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. */ int sysctl_min_slab_ratio = 5; static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than * accounted for by the pages on the file LRU lists because * tmpfs pages accounted for as ANON can also be FILE_MAPPED */ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ if (node_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* * Since we can't clean folios through reclaim, remove dirty file * folios from consideration. */ delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) delta = nr_pagecache_reclaimable; return nr_pagecache_reclaimable - delta; } /* * Try to free up some pages from this node through reclaim. */ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned long nr_pages, struct scan_control *sc) { struct task_struct *p = current; unsigned int noreclaim_flag; unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { shrink_node(pgdat, sc); } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc->gfp_mask); delayacct_freepages_end(); psi_memstall_leave(&pflags); trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; /* * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately * thrown out if the node is overallocated. So we do not reclaim * if less than a specified percentage of the node is used by * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return NODE_RECLAIM_NOSCAN; /* * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } enum { MEMORY_RECLAIM_SWAPPINESS = 0, MEMORY_RECLAIM_SWAPPINESS_MAX, MEMORY_RECLAIM_NULL, }; static const match_table_t tokens = { { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, { MEMORY_RECLAIM_NULL, NULL }, }; int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat) { unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; int swappiness = -1; char *old_buf, *start; substring_t args[MAX_OPT_ARGS]; gfp_t gfp_mask = GFP_KERNEL; if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) return -EINVAL; buf = strstrip(buf); old_buf = buf; nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; if (buf == old_buf) return -EINVAL; buf = strstrip(buf); while ((start = strsep(&buf, " ")) != NULL) { if (!strlen(start)) continue; switch (match_token(start, tokens, args)) { case MEMORY_RECLAIM_SWAPPINESS: if (match_int(&args[0], &swappiness)) return -EINVAL; if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) return -EINVAL; break; case MEMORY_RECLAIM_SWAPPINESS_MAX: swappiness = SWAPPINESS_ANON_ONLY; break; default: return -EINVAL; } } while (nr_reclaimed < nr_to_reclaim) { /* Will converge on zero, but reclaim enforces a minimum */ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) return -EINTR; /* * This is the final attempt, drain percpu lru caches in the * hope of introducing more evictable pages. */ if (!nr_retries) lru_add_drain_all(); if (memcg) { unsigned int reclaim_options; reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; reclaimed = try_to_free_mem_cgroup_pages(memcg, batch_size, gfp_mask, reclaim_options, swappiness == -1 ? NULL : &swappiness); } else { struct scan_control sc = { .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, .proactive = 1, }; if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return -EBUSY; reclaimed = __node_reclaim(pgdat, gfp_mask, batch_size, &sc); clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); } if (!reclaimed && !nr_retries--) return -EAGAIN; nr_reclaimed += reclaimed; } return 0; } #endif /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list * @fbatch: Batch of lru folios to check. * * Checks folios for evictability, if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should be only used for lru folios. */ void check_move_unevictable_folios(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < fbatch->nr; i++) { struct folio *folio = fbatch->folios[i]; int nr_pages = folio_nr_pages(folio); pgscanned += nr_pages; /* block memcg migration while the folio moves between lrus */ if (!folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_evictable(folio) && folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); pgrescued += nr_pages; } folio_set_lru(folio); } if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t reclaim_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret, nid = dev->id; ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); return ret ? -EAGAIN : count; } static DEVICE_ATTR_WO(reclaim); int reclaim_register_node(struct node *node) { return device_create_file(&node->dev, &dev_attr_reclaim); } void reclaim_unregister_node(struct node *node) { return device_remove_file(&node->dev, &dev_attr_reclaim); } #endif
2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/firmware.h> #include <linux/sfp.h> #include <net/devlink.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "module_fw.h" struct module_req_info { struct ethnl_req_info base; }; struct module_reply_data { struct ethnl_reply_data base; struct ethtool_module_power_mode_params power; }; #define MODULE_REPDATA(__reply_base) \ container_of(__reply_base, struct module_reply_data, base) /* MODULE_GET */ const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int module_get_power_mode(struct net_device *dev, struct module_reply_data *data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_module_power_mode) return 0; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } return ops->get_module_power_mode(dev, &data->power, extack); } static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; out_complete: ethnl_ops_complete(dev); return ret; } static int module_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct module_reply_data *data = MODULE_REPDATA(reply_base); int len = 0; if (data->power.policy) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ if (data->power.mode) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ return len; } static int module_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct module_reply_data *data = MODULE_REPDATA(reply_base); if (data->power.policy && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, data->power.policy)) return -EMSGSIZE; if (data->power.mode && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) return -EMSGSIZE; return 0; } /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; static int ethnl_set_module_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (req_info->dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(info->extack, "Module firmware flashing is in progress"); return -EBUSY; } if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } return 1; } static int ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_module_power_mode_params power = {}; struct ethtool_module_power_mode_params power_new; const struct ethtool_ops *ops; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; int ret; ops = dev->ethtool_ops; power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; if (power_new.policy == power.policy) return 0; ret = ops->set_module_power_mode(dev, &power_new, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_module_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_GET, .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_HEADER, .req_info_size = sizeof(struct module_req_info), .reply_data_size = sizeof(struct module_reply_data), .prepare_data = module_prepare_data, .reply_size = module_reply_size, .fill_reply = module_fill_reply, .set_validate = ethnl_set_module_validate, .set = ethnl_set_module, .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, }; /* MODULE_FW_FLASH_ACT */ const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1] = { [ETHTOOL_A_MODULE_FW_FLASH_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_MODULE_FW_FLASH_PASSWORD] = { .type = NLA_U32 }, }; static LIST_HEAD(module_fw_flash_work_list); static DEFINE_SPINLOCK(module_fw_flash_work_list_lock); static int module_flash_fw_work_list_add(struct ethtool_module_fw_flash *module_fw, struct genl_info *info) { struct ethtool_module_fw_flash *work; /* First, check if already registered. */ spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.ntf_params.portid == info->snd_portid && work->fw_update.dev == module_fw->fw_update.dev) { spin_unlock(&module_fw_flash_work_list_lock); return -EALREADY; } } list_add_tail(&module_fw->list, &module_fw_flash_work_list); spin_unlock(&module_fw_flash_work_list_lock); return 0; } static void module_flash_fw_work_list_del(struct list_head *list) { spin_lock(&module_fw_flash_work_list_lock); list_del(list); spin_unlock(&module_fw_flash_work_list_lock); } static void module_flash_fw_work(struct work_struct *work) { struct ethtool_module_fw_flash *module_fw; module_fw = container_of(work, struct ethtool_module_fw_flash, work); ethtool_cmis_fw_update(&module_fw->fw_update); module_flash_fw_work_list_del(&module_fw->list); module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false; netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker); release_firmware(module_fw->fw_update.fw); kfree(module_fw); } #define MODULE_EEPROM_PHYS_ID_PAGE 0 #define MODULE_EEPROM_PHYS_ID_I2C_ADDR 0x50 static int module_flash_fw_work_init(struct ethtool_module_fw_flash *module_fw, struct net_device *dev, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_module_eeprom page_data = {}; u8 phys_id; int err; /* Fetch the SFF-8024 Identifier Value. For all supported standards, it * is located at I2C address 0x50, byte 0. See section 4.1 in SFF-8024, * revision 4.9. */ page_data.page = MODULE_EEPROM_PHYS_ID_PAGE; page_data.offset = SFP_PHYS_ID; page_data.length = sizeof(phys_id); page_data.i2c_address = MODULE_EEPROM_PHYS_ID_I2C_ADDR; page_data.data = &phys_id; err = ops->get_module_eeprom_by_page(dev, &page_data, extack); if (err < 0) return err; switch (phys_id) { case SFF8024_ID_QSFP_DD: case SFF8024_ID_OSFP: case SFF8024_ID_DSFP: case SFF8024_ID_QSFP_PLUS_CMIS: case SFF8024_ID_SFP_DD_CMIS: case SFF8024_ID_SFP_PLUS_CMIS: INIT_WORK(&module_fw->work, module_flash_fw_work); break; default: NL_SET_ERR_MSG(extack, "Module type does not support firmware flashing"); return -EOPNOTSUPP; } return 0; } void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv) { struct ethtool_module_fw_flash *work; spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.dev == sk_priv->dev && work->fw_update.ntf_params.portid == sk_priv->portid) { work->fw_update.ntf_params.closed_sock = true; break; } } spin_unlock(&module_fw_flash_work_list_lock); } static int module_flash_fw_schedule(struct net_device *dev, const char *file_name, struct ethtool_module_fw_flash_params *params, struct sk_buff *skb, struct genl_info *info) { struct ethtool_cmis_fw_update_params *fw_update; struct ethtool_module_fw_flash *module_fw; int err; module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL); if (!module_fw) return -ENOMEM; fw_update = &module_fw->fw_update; fw_update->params = *params; err = request_firmware_direct(&fw_update->fw, file_name, &dev->dev); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to request module firmware image"); goto err_free; } err = module_flash_fw_work_init(module_fw, dev, info->extack); if (err < 0) goto err_release_firmware; dev->ethtool->module_fw_flash_in_progress = true; netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); fw_update->dev = dev; fw_update->ntf_params.portid = info->snd_portid; fw_update->ntf_params.seq = info->snd_seq; fw_update->ntf_params.closed_sock = false; err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid, ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH); if (err < 0) goto err_release_firmware; err = module_flash_fw_work_list_add(module_fw, info); if (err < 0) goto err_release_firmware; schedule_work(&module_fw->work); return 0; err_release_firmware: release_firmware(fw_update->fw); err_free: kfree(module_fw); return err; } static int module_flash_fw(struct net_device *dev, struct nlattr **tb, struct sk_buff *skb, struct genl_info *info) { struct ethtool_module_fw_flash_params params = {}; const char *file_name; struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME)) return -EINVAL; file_name = nla_data(tb[ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME]); attr = tb[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD]; if (attr) { params.password = cpu_to_be32(nla_get_u32(attr)); params.password_valid = true; } return module_flash_fw_schedule(dev, file_name, &params, skb, info); } static int ethnl_module_fw_flash_validate(struct net_device *dev, struct netlink_ext_ack *extack) { struct devlink_port *devlink_port = dev->devlink_port; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->set_module_eeprom_by_page || !ops->get_module_eeprom_by_page) { NL_SET_ERR_MSG(extack, "Flashing module firmware is not supported by this device"); return -EOPNOTSUPP; } if (!ops->reset) { NL_SET_ERR_MSG(extack, "Reset module is not supported by this device, so flashing is not permitted"); return -EOPNOTSUPP; } if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing already in progress"); return -EBUSY; } if (dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Netdevice is up, so flashing is not permitted"); return -EBUSY; } if (devlink_port && devlink_port->attrs.split) { NL_SET_ERR_MSG(extack, "Can't perform firmware flashing on a split port"); return -EOPNOTSUPP; } return 0; } int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_FW_FLASH_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) goto out_unlock; ret = module_flash_fw(dev, tb, skb, info); ethnl_ops_complete(dev); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } /* MODULE_FW_FLASH_NTF */ static int ethnl_module_fw_flash_ntf_put_err(struct sk_buff *skb, char *err_msg, char *sub_err_msg) { int err_msg_len, sub_err_msg_len, total_len; struct nlattr *attr; if (!err_msg) return 0; err_msg_len = strlen(err_msg); total_len = err_msg_len + 2; /* For period and NUL. */ if (sub_err_msg) { sub_err_msg_len = strlen(sub_err_msg); total_len += sub_err_msg_len + 2; /* For ", ". */ } attr = nla_reserve(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, total_len); if (!attr) return -ENOMEM; if (sub_err_msg) sprintf(nla_data(attr), "%s, %s.", err_msg, sub_err_msg); else sprintf(nla_data(attr), "%s.", err_msg); return 0; } static void ethnl_module_fw_flash_ntf(struct net_device *dev, enum ethtool_module_fw_flash_status status, struct ethnl_module_fw_flash_ntf_params *ntf_params, char *err_msg, char *sub_err_msg, u64 done, u64 total) { struct sk_buff *skb; void *hdr; int ret; if (ntf_params->closed_sock) return; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return; hdr = ethnl_unicast_put(skb, ntf_params->portid, ++ntf_params->seq, ETHTOOL_MSG_MODULE_FW_FLASH_NTF); if (!hdr) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_MODULE_FW_FLASH_HEADER); if (ret < 0) goto err_skb; if (nla_put_u32(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS, status)) goto err_skb; ret = ethnl_module_fw_flash_ntf_put_err(skb, err_msg, sub_err_msg); if (ret < 0) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_DONE, done)) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_TOTAL, total)) goto err_skb; genlmsg_end(skb, hdr); genlmsg_unicast(dev_net(dev), skb, ntf_params->portid); return; err_skb: nlmsg_free(skb); } void ethnl_module_fw_flash_ntf_err(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, char *err_msg, char *sub_err_msg) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_ERROR, params, err_msg, sub_err_msg, 0, 0); } void ethnl_module_fw_flash_ntf_start(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_STARTED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_complete(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_COMPLETED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, u64 done, u64 total) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_IN_PROGRESS, params, NULL, NULL, done, total); }
4 4 4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2019 Mellanox Technologies. All rights reserved. */ #include <rdma/ib_verbs.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" #define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask, bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) return -EINVAL; if (port_counter->num_counters) return -EBUSY; } port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; port_counter->mode.bind_opcnt = bind_opcnt; return 0; } /* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * * @dev: Device to operate * @port: Port to use * @mask: Mask to configure * @extack: Message to the user * * Return 0 on success. If counter mode wasn't changed then it is considered * as success as well. * Return -EBUSY when changing to auto mode while there are bounded counters. * */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; mutex_lock(&port_counter->lock); if (mask) mode = RDMA_COUNTER_MODE_AUTO; else mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && port_counter->mode.mask == mask && port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); if (ret == -EBUSY) NL_SET_ERR_MSG( extack, "Modifying auto mode is not allowed when there is a bound QP"); return ret; } static void auto_mode_init_counter(struct rdma_counter *counter, const struct ib_qp *qp, enum rdma_nl_counter_mask new_mask) { struct auto_mode_param *param = &counter->mode.param; counter->mode.mode = RDMA_COUNTER_MODE_AUTO; counter->mode.mask = new_mask; if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) param->qp_type = qp->qp_type; } static int __rdma_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { int ret; if (qp->counter) return -EINVAL; if (!qp->device->ops.counter_bind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; } int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable) { struct rdma_hw_stats *stats; int ret = 0; if (!dev->ops.modify_hw_stat) return -EOPNOTSUPP; stats = ib_get_hw_stats_port(dev, port); if (!stats || index >= stats->num_counters || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) return -EINVAL; mutex_lock(&stats->lock); if (enable != test_bit(index, stats->is_disabled)) goto out; ret = dev->ops.modify_hw_stat(dev, port, index, enable); if (ret) goto out; if (enable) clear_bit(index, stats->is_disabled); else set_bit(index, stats->is_disabled); out: mutex_unlock(&stats->lock); return ret; } static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, enum rdma_nl_counter_mode mode, bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = port; dev->ops.counter_init(counter); rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = &dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; } break; case RDMA_COUNTER_MODE_AUTO: auto_mode_init_counter(counter, qp, port_counter->mode.mask); break; default: ret = -EOPNOTSUPP; mutex_unlock(&port_counter->lock); goto err_mode; } port_counter->num_counters++; mutex_unlock(&port_counter->lock); counter->mode.mode = mode; counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; rdma_restrack_parent_name(&counter->res, &qp->res); rdma_restrack_add(&counter->res); return counter; err_mode: rdma_free_hw_stats_struct(counter->stats); err_stats: rdma_restrack_put(&counter->res); kfree(counter); return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { struct rdma_port_counter *port_counter; port_counter = &counter->device->port_data[counter->port].port_counter; mutex_lock(&port_counter->lock); port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, false); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); rdma_free_hw_stats_struct(counter->stats); kfree(counter); } static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); if (auto_mask & RDMA_COUNTER_MASK_PID) match &= (task_pid_nr(counter->res.task) == task_pid_nr(qp->res.task)); return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; if (!qp->device->ops.counter_unbind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; } static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; int i; port_counter = &dev->port_data[counter->port].port_counter; if (!port_counter->hstats) return; rdma_counter_query_stats(counter); for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } /* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; struct ib_device *dev = qp->device; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; unsigned long id = 0; port_counter = &dev->port_data[port].port_counter; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { counter = container_of(res, struct rdma_counter, res); if ((counter->device != qp->device) || (counter->port != port)) goto next; if (auto_mode_match(qp, counter, port_counter->mode.mask)) break; next: counter = NULL; } if (counter && !kref_get_unless_zero(&counter->kref)) counter = NULL; xa_unlock(&rt->xa); return counter; } static void counter_release(struct kref *kref) { struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } /* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) return 0; counter = rdma_get_counter_auto_mode(qp, port); if (counter) { ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } return 0; } /* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; if (!counter) return -EINVAL; ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; kref_put(&counter->kref, counter_release); return 0; } int rdma_counter_query_stats(struct rdma_counter *counter) { struct ib_device *dev = counter->device; int ret; if (!dev->ops.counter_update_stats) return -EINVAL; mutex_lock(&counter->lock); ret = dev->ops.counter_update_stats(counter); mutex_unlock(&counter->lock); return ret; } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, u32 port, u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct rdma_counter *counter; unsigned long id = 0; u64 sum = 0; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; xa_unlock(&rt->xa); counter = container_of(res, struct rdma_counter, res); if ((counter->device != dev) || (counter->port != port) || rdma_counter_query_stats(counter)) goto next; sum += counter->stats->value[index]; next: xa_lock(&rt->xa); rdma_restrack_put(res); } xa_unlock(&rt->xa); return sum; } /* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return 0; sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; return sum; } static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) { struct rdma_restrack_entry *res = NULL; struct ib_qp *qp = NULL; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); if (IS_ERR(res)) return NULL; qp = container_of(res, struct ib_qp, res); if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) goto err; return qp; err: rdma_restrack_put(res); return NULL; } static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { struct rdma_restrack_entry *res; struct rdma_counter *counter; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); if (IS_ERR(res)) return NULL; counter = container_of(res, struct rdma_counter, res); kref_get(&counter->kref); rdma_restrack_put(res); return counter; } /* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; counter = rdma_get_counter_by_id(dev, counter_id); if (!counter) { ret = -ENOENT; goto err; } if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { ret = -EINVAL; goto err_task; } if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; rdma_restrack_put(&qp->res); return 0; err_task: kref_put(&counter->kref, counter_release); err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto err; } counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; } if (counter_id) *counter_id = counter->id; rdma_restrack_put(&qp->res); return 0; err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto out; } port_counter = &dev->port_data[port].port_counter; if (!qp->counter || qp->counter->id != counter_id || port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { ret = -EINVAL; goto out; } ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); return ret; } int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask, bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; *opcnt = port_counter->mode.bind_opcnt; return 0; } void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port, i; if (!dev->port_data) return; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); if (!dev->ops.alloc_hw_port_stats) continue; port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } return; fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } } void rdma_counter_release(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } }
4 4 4 4 4 4 4 4 4 4 150 149 149 150 4 1 1 1 1 1 4 4 149 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 /* * linux/fs/nls/nls_base.c * * Native language support--charsets and unicode translations. * By Gordon Chaffee 1996, 1997 * * Unicode based case conversion 1999 by Wolfram Pienkoss * */ #include <linux/module.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/kmod.h> #include <linux/spinlock.h> #include <asm/byteorder.h> static struct nls_table default_table; static struct nls_table *tables = &default_table; static DEFINE_SPINLOCK(nls_lock); /* * Sample implementation from Unicode home page. * http://www.stonehand.com/unicode/standard/fss-utf.html */ struct utf8_table { int cmask; int cval; int shift; long lmask; long lval; }; static const struct utf8_table utf8_table[] = { {0x80, 0x00, 0*6, 0x7F, 0, /* 1 byte sequence */}, {0xE0, 0xC0, 1*6, 0x7FF, 0x80, /* 2 byte sequence */}, {0xF0, 0xE0, 2*6, 0xFFFF, 0x800, /* 3 byte sequence */}, {0xF8, 0xF0, 3*6, 0x1FFFFF, 0x10000, /* 4 byte sequence */}, {0xFC, 0xF8, 4*6, 0x3FFFFFF, 0x200000, /* 5 byte sequence */}, {0xFE, 0xFC, 5*6, 0x7FFFFFFF, 0x4000000, /* 6 byte sequence */}, {0, /* end of table */} }; #define UNICODE_MAX 0x0010ffff #define PLANE_SIZE 0x00010000 #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; const struct utf8_table *t; nc = 0; c0 = *s; l = c0; for (t = utf8_table; t->cmask; t++) { nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; if (l < t->lval || l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -EILSEQ; *pu = (unicode_t) l; return nc; } if (inlen <= nc) return -EOVERFLOW; s++; c = (*s ^ 0x80) & 0xFF; if (c & 0xC0) return -EILSEQ; l = (l << 6) | c; } return -EILSEQ; } EXPORT_SYMBOL(utf8_to_utf32); int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; const struct utf8_table *t; if (!s) return 0; l = u; if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) return -EILSEQ; nc = 0; for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -EOVERFLOW; } EXPORT_SYMBOL(utf32_to_utf8); static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) { switch (endian) { default: *s = (wchar_t) c; break; case UTF16_LITTLE_ENDIAN: *s = __cpu_to_le16(c); break; case UTF16_BIG_ENDIAN: *s = __cpu_to_be16(c); break; } } int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; s += size; inlen -= size; if (u >= PLANE_SIZE) { if (maxout < 2) break; u -= PLANE_SIZE; put_utf16(op++, SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS), endian); put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS), endian); maxout -= 2; } else { put_utf16(op++, u, endian); maxout--; } } else { put_utf16(op++, *s++, endian); inlen--; maxout--; } } return op - pwcs; } EXPORT_SYMBOL(utf8s_to_utf16s); static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) { switch (endian) { default: return c; case UTF16_LITTLE_ENDIAN: return __le16_to_cpu(c); case UTF16_BIG_ENDIAN: return __be16_to_cpu(c); } } int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || !(v & SURROGATE_LOW)) { /* Ignore character and move on */ continue; } u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; inlen--; } size = utf32_to_utf8(u, op, maxout); if (size < 0) { if (size == -EILSEQ) { /* Ignore character and move on */ continue; } /* * Stop filling the buffer with data once a character * does not fit anymore. */ break; } else { op += size; maxout -= size; } } else { *op++ = (u8) u; maxout--; } } return op - s; } EXPORT_SYMBOL(utf16s_to_utf8s); int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { spin_unlock(&nls_lock); return -EBUSY; } tmp = &(*tmp)->next; } nls->next = tables; tables = nls; spin_unlock(&nls_lock); return 0; } EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { struct nls_table ** tmp = &tables; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { *tmp = nls->next; spin_unlock(&nls_lock); return 0; } tmp = &(*tmp)->next; } spin_unlock(&nls_lock); return -EINVAL; } static struct nls_table *find_nls(const char *charset) { struct nls_table *nls; spin_lock(&nls_lock); for (nls = tables; nls; nls = nls->next) { if (!strcmp(nls->charset, charset)) break; if (nls->alias && !strcmp(nls->alias, charset)) break; } if (nls && !try_module_get(nls->owner)) nls = NULL; spin_unlock(&nls_lock); return nls; } struct nls_table *load_nls(const char *charset) { return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) { if (nls) module_put(nls->owner); } static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00 }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table default_table = { .charset = "default", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; /* Returns a simple default translation table */ struct nls_table *load_nls_default(void) { struct nls_table *default_nls; default_nls = load_nls(CONFIG_NLS_DEFAULT); if (default_nls != NULL) return default_nls; else return &default_table; } EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); MODULE_DESCRIPTION("Base file system native language support"); MODULE_LICENSE("Dual BSD/GPL");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif
1 1 1 1 1 5 6 5 1 6 9 5 5 160 162 162 159 162 3 3 3 3 3 3 3 3 3 162 162 161 161 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 5 5 5 5 5 5 6 6 6 6 6 6 5 6 6 6 6 6 6 6 6 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 9 8 9 9 6 7 6 6 5 5 5 5 6 6 1 1 1 5 6 6 6 1 1 1 1 7 1 7 7 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> #include <linux/pgalloc.h> #include <asm/tlb.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. */ unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. */ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t __sleep_millisecs_store(const char *buf, size_t count, unsigned int *millisecs) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static bool pte_none_or_zero(pte_t pte) { if (pte_none(pte)) return true; return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); } int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: #ifdef CONFIG_S390 /* * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 * can't handle this properly after s390_enable_sie, so we simply * ignore the madvise to prevent qemu from causing a SIGSEGV. */ if (mm_has_pgste(vma->vm_mm)) return 0; #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. */ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_always)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && hugepage_global_enabled()) return true; if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) return true; return false; } void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); } void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); } } static void release_pte_folio(struct folio *folio) { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_unlock(folio); folio_putback_lru(folio); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) continue; VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; folio = pfn_folio(pfn); if (folio_test_large(folio)) continue; release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { list_del(&folio->lru); release_pte_folio(folio); } } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } if (!pte_present(pteval)) { result = SCAN_PTE_NON_PRESENT; goto out; } if (pte_uffd_wp(pteval)) { result = SCAN_PTE_UFFD_WP; goto out; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); /* See hpage_collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } } if (folio_test_large(folio)) { struct folio *f; /* * Check if we have dealt with the compound page * already */ list_for_each_entry(f, compound_pagelist, lru) { if (folio == f) goto next; } } /* * We can do it before folio_isolate_lru because the * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* * Check if the page has any GUP (or other external) pins. * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (!folio_isolate_lru(folio)) { folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (folio_test_large(folio)) list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; pte_t pteval; pte_t *_pte; unsigned int nr_ptes; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (pte_none(pteval)) continue; /* * ptl mostly unnecessary. */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); src = page_folio(src_page); if (folio_test_large(src)) { unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); } else { release_pte_folio(src); } /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); clear_ptes(vma->vm_mm, address, _pte, nr_ptes); folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); free_swap_cache(src); folio_put_refs(src, nr_ptes); } } list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { list_del(&src->lru); node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); free_swap_cache(src); folio_putback_lru(src); } } static void __collapse_huge_page_copy_failed(pte_t *pte, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; /* * Re-establish the PMD to point to the original page table * entry. Restoring PMD needs to be done prior to releasing * pages. Since pages are still isolated and locked here, * acquiring anon_vma_lock_write is unnecessary. */ pmd_ptl = pmd_lock(vma->vm_mm, pmd); pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); spin_unlock(pmd_ptl); /* * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); } /* * __collapse_huge_page_copy - attempts to copy memory contents from raw * pages to a hugepage. Cleans up the raw pages if copying succeeds; * otherwise restores the original page table and releases isolated raw pages. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. * * @pte: starting of the PTEs to copy from * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area * @address: starting address to copy * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; int result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. */ for (i = 0; i < HPAGE_PMD_NR; i++) { pte_t pteval = ptep_get(pte + i); struct page *page = folio_page(folio, i); unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } } if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, compound_pagelist); return result; } static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; /* * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) return SCAN_PAGE_ANON; return SCAN_SUCCEED; } static inline int check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. */ if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; if (new_pmd != pmd) return SCAN_FAIL; return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = addr, .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; if (!pte++) { /* * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); if (pte_none(vmf.orig_pte) || pte_present(vmf.orig_pte)) continue; vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); /* Which unmaps pte (after perhaps re-checking the entry) */ pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because * we do not retry here and swap entry will remain in pagetable * resulting in later failure. */ if (ret & VM_FAULT_RETRY) { /* Likely, but not guaranteed, that page lock failed */ result = SCAN_PAGE_LOCK; goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); result = SCAN_FAIL; goto out; } swapped_in++; } if (pte) pte_unmap(pte); /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); result = SCAN_SUCCEED; out: trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); return result; } static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); *foliop = folio; return SCAN_SUCCEED; } static int collapse_huge_page(struct mm_struct *mm, unsigned long address, int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } if (unmapped) { /* * __collapse_huge_page_swapin will return with mmap_lock * released when it fails. So we jump out_nolock directly in * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, referenced); if (result != SCAN_SUCCEED) goto out_nolock; } mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. * * UFFDIO_MOVE is prevented to race as well thanks to the * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { if (pte) pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing * hugepmds and never for establishing regular pmds that * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); pgtable_trans_huge_deposit(mm, pmd, pgtable); map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } folio = page_folio(page); if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * We treat a single page as shared if any part of the THP * is shared. */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct mm_slot *slot) { struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. * * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } /* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, }; pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; mmap_assert_locked(vma->vm_mm); if (!pmdp) { pgdp = pgd_offset(mm, addr); p4dp = p4d_alloc(mm, pgdp, addr); if (!p4dp) return SCAN_FAIL; pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return SCAN_FAIL; pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) return SCAN_FAIL; } vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; folio_get(folio); return SCAN_SUCCEED; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { int nr_mapped_ptes = 0, result = SCAN_FAIL; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) goto abort; /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, pte += nr_batch_ptes) { unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); nr_batch_ptes = 1; if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ clear_ptes(mm, addr, pte, nr_batch_ptes); folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_mapped_ptes) { folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { flush_tlb_mm(mm); goto unlock; } } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_mapped_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. */ if (READ_ONCE(vma->anon_vma)) return false; /* * When a vma is registered with uffd-wp, we cannot recycle * the page table because there may be pte markers installed. * Other vmas can still have the same file mapped hugely, but * skip this one: it will always be mapped in small page size * for uffd-wp registered ranges. */ if (userfaultfd_wp(vma)) return false; /* * If the VMA contains guard regions then we can't collapse it. * * This is set atomically on guard marker installation under mmap/VMA * read lock, and here we may not hold any VMA or mmap lock at all. * * This is therefore serialised on the PTE page table lock, which is * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool success = false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; if (hpage_collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); /* * The lock of new_folio is still held, we will be blocked in * the page fault path, which prevents the pte entries from * being set again. So even though the old empty PTE page may be * concurrently freed and a new PTE page is filled into the pmd * entry, it is still empty and can be removed. * * So here we only need to recheck if the state of pmd entry * still meets our requirements, rather than checking pmd_same() * like elsewhere. */ if (check_pmd_state(pmd) != SCAN_SUCCEED) goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* * Huge page lock is still held, so normally the page table must * remain empty; and we have already skipped anon_vma and * userfaultfd_wp() vmas. But since the mmap_lock is not held, * it is still possible for a racing userfaultfd_ioctl() or * madvise() to have inserted ptes or markers. Now that we hold * ptlock, repeating the retractable checks protects us from * races against the prior checks. */ if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; } if (ptl != pml) spin_unlock(ptl); drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } i_mmap_unlock_read(mapping); } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache, locking old pages * + swap/gup in pages if necessary; * - copy data to new page * - handle shmem holes * + re-validate that holes weren't filled by someone else * + check for userfaultfd * - finalize updates to the page cache; * - if replacing succeeds: * + unlock huge page; * + free old pages; * - if replacing failed; * + unlock old pages * + unlock and free huge page; */ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; mapping_set_update(&xas, mapping); __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); new_folio->index = start; new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; goto rollback; } } while (1); for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely * empty. */ if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; goto xa_locked; } } nr_none++; index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't * been flushed since first write. There * won't be new dirty pages. * * Trigger async flush here and hope the * writeback is done when khugepaged * revisits this page. * * This is a one-off situation. We are not * forcing writeback in loop. */ xas_unlock_irq(&xas); filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_FAIL; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } /* * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); /* make sure the folio is up to date */ if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } if (!is_shmem && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * folio is dirty because it hasn't been flushed * since first write. */ result = SCAN_FAIL; goto out_unlock; } if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; } if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* * We control 2 + nr_pages references to the folio: * - we hold a pin on it; * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page * cache. That requires locking the folio to handle * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); goto out_unlock; } /* * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); folio_put(folio); goto xa_unlocked; } if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure i_writecount is up to date and the update to nr_thps * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; filemap_nr_thps_dec(mapping); } } xa_locked: xas_unlock_irq(&xas); xa_unlocked: /* * If collapse is successful, flush must be done now before copying. * If collapse is unsuccessful, does flush actually need to be done? * Do it anyway, to clear the state. */ try_to_unmap_flush(); if (result == SCAN_SUCCEED && nr_none && !shmem_charge(mapping->host, nr_none)) result = SCAN_FAIL; if (result != SCAN_SUCCEED) { nr_none = 0; goto rollback; } /* * The old folios are locked, so they won't change anymore. */ index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { int i, nr_pages = folio_nr_pages(folio); while (index < folio->index) { clear_highpage(dst); index++; dst++; } for (i = 0; i < nr_pages; i++) { if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; dst++; } } while (index < end) { clear_highpage(dst); index++; dst++; } if (nr_none) { struct vm_area_struct *vma; int nr_none_check = 0; i_mmap_lock_read(mapping); xas_lock_irq(&xas); xas_set(&xas, start); for (index = start; index < end; index++) { if (!xas_next(&xas)) { xas_store(&xas, XA_RETRY_ENTRY); if (xas_error(&xas)) { result = SCAN_STORE_FAILED; goto immap_locked; } nr_none_check++; } } if (nr_none != nr_none_check) { result = SCAN_PAGE_FILLED; goto immap_locked; } /* * If userspace observed a missing page in a VMA with * a MODE_MISSING userfaultfd, then it might expect a * UFFD_EVENT_PAGEFAULT for that page. If so, we need to * roll back to avoid suppressing such an event. Since * wp/minor userfaultfds don't give userspace any * guarantees that the kernel doesn't fill a missing * page with a zero page, so they don't matter here. * * Any userfaultfds registered after this point will * not be able to observe any missing pages due to the * previously inserted retry entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { result = SCAN_EXCEED_NONE_PTE; goto immap_locked; } } immap_locked: i_mmap_unlock_read(mapping); if (result != SCAN_SUCCEED) { xas_set(&xas, start); for (index = start; index < end; index++) { if (xas_next(&xas) == XA_RETRY_ENTRY) xas_store(&xas, NULL); } xas_unlock_irq(&xas); goto rollback; } } else { xas_lock_irq(&xas); } if (is_shmem) lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* * Mark new_folio as uptodate before inserting it into the * page cache so that it isn't mistaken for an fallocated but * unwritten page. */ folio_mark_uptodate(new_folio); folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) folio_mark_dirty(new_folio); folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); /* * Remove pte page tables, so we can re-fault the page as huge. * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; folio_unlock(new_folio); /* * The collapse has succeeded, so free the old folios. */ list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio->mapping = NULL; folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; rollback: /* Something went wrong: roll back page cache changes */ if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio_unlock(folio); folio_putback_lru(folio); folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM * file only. This undo is not needed unless failure is * due to SCAN_COPY_MC. */ if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure the update to nr_thps is visible. */ smp_mb(); } new_folio->mapping = NULL; folio_unlock(new_folio); folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); return result; } static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) { swap += 1 << xas_get_order(&xas); if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } if (!folio_try_get(folio)) { xas_reset(&xas); continue; } if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); xas_reset(&xas); continue; } if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so * it's safe to skip LRU and refcount checks before * returning. */ folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; folio_put(folio); break; } if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; folio_put(folio); break; } /* * We probably should check if the folio is referenced * here, but nobody would transfer pte_young() to * folio_test_referenced() for us. And rmap walk here * is just too costly... */ present += folio_nr_pages(folio); folio_put(folio); if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } } trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { slot = khugepaged_scan.mm_slot; } else { slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. */ vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; progress++; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { bool mmap_locked = true; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; mmap_read_unlock(mm); } } else { *result = hpage_collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; if (!mmap_locked) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage * allocations, so if allocation fails, we are * guaranteed to break here and report the * correct result back to caller. */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; } } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } collect_mm_slot(slot); } return progress; } static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || kthread_should_stop(); } static void khugepaged_do_scan(struct collapse_control *cc) { unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; int result = SCAN_SUCCEED; lru_add_drain_all(); while (true) { cond_resched(); if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); if (progress >= pages) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. */ if (!wait) break; wait = false; khugepaged_alloc_sleep(); } } } static bool khugepaged_should_wakeup(void) { return kthread_should_stop() || time_after_eq(jiffies, khugepaged_sleep_expire); } static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; if (slot) collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; unsigned long recommended_min; if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of * ZONE_MOVABLE since it only has movable pages. */ if (zone_idx(zone) > gfp_zone(GFP_USER)) continue; nr_zones++; } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_nr_pages * nr_zones * MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; /* don't ever allow to reserve more than 5% of the lowmem */ recommended_min = min(recommended_min, (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { int err = 0; mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } bool current_is_khugepaged(void) { return kthread_func(current) == khugepaged; } static int madvise_collapse_errno(enum scan_result r) { /* * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide * actionable feedback to caller, so they may take an appropriate * fallback measure depending on the nature of the failure. */ switch (r) { case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to * specified memory range. khugepaged likely won't be able to collapse * either. */ default: return -EINVAL; } } int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; int thps = 0, last_fail = SCAN_FAIL; bool mmap_locked = true; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); if (!cc) return -ENOMEM; cc->is_khugepaged = false; mmgrab(mm); lru_add_drain_all(); hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { int result = SCAN_FAIL; if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); mmap_locked = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; } hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); mmap_read_unlock(mm); mmap_locked = false; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) *lock_dropped = true; handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: last_fail = result; /* Other error, exit */ goto out_maybelock; } } out_maybelock: /* Caller expects us to hold mmap_lock on return */ if (!mmap_locked) mmap_read_lock(mm); out_nolock: mmap_assert_locked(mm); mmdrop(mm); kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); }
19 19 15 19 19 5 5 5 5 3 3 3 3 7 11 12 2 12 11 2 8 5 8 10 7 6 5 11 6 6 6 3 11 1 1 1 1 2 1 1 1 8 9 9 1 1 5 4 8 7 4 8 6 8 8 9 8 1 3 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 // SPDX-License-Identifier: GPL-2.0-or-later /* * MIDI byte <-> sequencer event coder * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de>, * Jaroslav Kysela <perex@perex.cz> */ #include <linux/slab.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/module.h> #include <sound/core.h> #include <sound/seq_kernel.h> #include <sound/seq_midi_event.h> #include <sound/asoundef.h> MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>, Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("MIDI byte <-> sequencer event coder"); MODULE_LICENSE("GPL"); /* event type, index into status_event[] */ /* from 0 to 6 are normal commands (note off, on, etc.) for 0x9?-0xe? */ #define ST_INVALID 7 #define ST_SPECIAL 8 #define ST_SYSEX ST_SPECIAL /* from 8 to 15 are events for 0xf0-0xf7 */ /* * prototypes */ static void note_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void one_param_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void pitchbend_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void two_param_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void one_param_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void songpos_event(struct snd_midi_event *dev, struct snd_seq_event *ev); static void note_decode(struct snd_seq_event *ev, unsigned char *buf); static void one_param_decode(struct snd_seq_event *ev, unsigned char *buf); static void pitchbend_decode(struct snd_seq_event *ev, unsigned char *buf); static void two_param_decode(struct snd_seq_event *ev, unsigned char *buf); static void songpos_decode(struct snd_seq_event *ev, unsigned char *buf); /* * event list */ static struct status_event_list { int event; int qlen; void (*encode)(struct snd_midi_event *dev, struct snd_seq_event *ev); void (*decode)(struct snd_seq_event *ev, unsigned char *buf); } status_event[] = { /* 0x80 - 0xef */ {SNDRV_SEQ_EVENT_NOTEOFF, 2, note_event, note_decode}, {SNDRV_SEQ_EVENT_NOTEON, 2, note_event, note_decode}, {SNDRV_SEQ_EVENT_KEYPRESS, 2, note_event, note_decode}, {SNDRV_SEQ_EVENT_CONTROLLER, 2, two_param_ctrl_event, two_param_decode}, {SNDRV_SEQ_EVENT_PGMCHANGE, 1, one_param_ctrl_event, one_param_decode}, {SNDRV_SEQ_EVENT_CHANPRESS, 1, one_param_ctrl_event, one_param_decode}, {SNDRV_SEQ_EVENT_PITCHBEND, 2, pitchbend_ctrl_event, pitchbend_decode}, /* invalid */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xf0 - 0xff */ {SNDRV_SEQ_EVENT_SYSEX, 1, NULL, NULL}, /* sysex: 0xf0 */ {SNDRV_SEQ_EVENT_QFRAME, 1, one_param_event, one_param_decode}, /* 0xf1 */ {SNDRV_SEQ_EVENT_SONGPOS, 2, songpos_event, songpos_decode}, /* 0xf2 */ {SNDRV_SEQ_EVENT_SONGSEL, 1, one_param_event, one_param_decode}, /* 0xf3 */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xf4 */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xf5 */ {SNDRV_SEQ_EVENT_TUNE_REQUEST, 0, NULL, NULL}, /* 0xf6 */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xf7 */ {SNDRV_SEQ_EVENT_CLOCK, 0, NULL, NULL}, /* 0xf8 */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xf9 */ {SNDRV_SEQ_EVENT_START, 0, NULL, NULL}, /* 0xfa */ {SNDRV_SEQ_EVENT_CONTINUE, 0, NULL, NULL}, /* 0xfb */ {SNDRV_SEQ_EVENT_STOP, 0, NULL, NULL}, /* 0xfc */ {SNDRV_SEQ_EVENT_NONE, -1, NULL, NULL}, /* 0xfd */ {SNDRV_SEQ_EVENT_SENSING, 0, NULL, NULL}, /* 0xfe */ {SNDRV_SEQ_EVENT_RESET, 0, NULL, NULL}, /* 0xff */ }; static int extra_decode_ctrl14(struct snd_midi_event *dev, unsigned char *buf, int len, struct snd_seq_event *ev); static int extra_decode_xrpn(struct snd_midi_event *dev, unsigned char *buf, int count, struct snd_seq_event *ev); static struct extra_event_list { int event; int (*decode)(struct snd_midi_event *dev, unsigned char *buf, int len, struct snd_seq_event *ev); } extra_event[] = { {SNDRV_SEQ_EVENT_CONTROL14, extra_decode_ctrl14}, {SNDRV_SEQ_EVENT_NONREGPARAM, extra_decode_xrpn}, {SNDRV_SEQ_EVENT_REGPARAM, extra_decode_xrpn}, }; /* * new/delete record */ int snd_midi_event_new(int bufsize, struct snd_midi_event **rdev) { struct snd_midi_event *dev; *rdev = NULL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (dev == NULL) return -ENOMEM; if (bufsize > 0) { dev->buf = kmalloc(bufsize, GFP_KERNEL); if (dev->buf == NULL) { kfree(dev); return -ENOMEM; } } dev->bufsize = bufsize; dev->lastcmd = 0xff; dev->type = ST_INVALID; spin_lock_init(&dev->lock); *rdev = dev; return 0; } EXPORT_SYMBOL(snd_midi_event_new); void snd_midi_event_free(struct snd_midi_event *dev) { if (dev != NULL) { kfree(dev->buf); kfree(dev); } } EXPORT_SYMBOL(snd_midi_event_free); /* * initialize record */ static inline void reset_encode(struct snd_midi_event *dev) { dev->read = 0; dev->qlen = 0; dev->type = ST_INVALID; } void snd_midi_event_reset_encode(struct snd_midi_event *dev) { guard(spinlock_irqsave)(&dev->lock); reset_encode(dev); } EXPORT_SYMBOL(snd_midi_event_reset_encode); void snd_midi_event_reset_decode(struct snd_midi_event *dev) { guard(spinlock_irqsave)(&dev->lock); dev->lastcmd = 0xff; } EXPORT_SYMBOL(snd_midi_event_reset_decode); void snd_midi_event_no_status(struct snd_midi_event *dev, int on) { dev->nostat = on ? 1 : 0; } EXPORT_SYMBOL(snd_midi_event_no_status); /* * read one byte and encode to sequencer event: * return true if MIDI bytes are encoded to an event * false data is not finished */ bool snd_midi_event_encode_byte(struct snd_midi_event *dev, unsigned char c, struct snd_seq_event *ev) { bool rc = false; if (c >= MIDI_CMD_COMMON_CLOCK) { /* real-time event */ ev->type = status_event[ST_SPECIAL + c - 0xf0].event; ev->flags &= ~SNDRV_SEQ_EVENT_LENGTH_MASK; ev->flags |= SNDRV_SEQ_EVENT_LENGTH_FIXED; return ev->type != SNDRV_SEQ_EVENT_NONE; } guard(spinlock_irqsave)(&dev->lock); if ((c & 0x80) && (c != MIDI_CMD_COMMON_SYSEX_END || dev->type != ST_SYSEX)) { /* new command */ dev->buf[0] = c; if ((c & 0xf0) == 0xf0) /* system messages */ dev->type = (c & 0x0f) + ST_SPECIAL; else dev->type = (c >> 4) & 0x07; dev->read = 1; dev->qlen = status_event[dev->type].qlen; } else { if (dev->qlen > 0) { /* rest of command */ dev->buf[dev->read++] = c; if (dev->type != ST_SYSEX) dev->qlen--; } else { /* running status */ dev->buf[1] = c; dev->qlen = status_event[dev->type].qlen - 1; dev->read = 2; } } if (dev->qlen == 0) { ev->type = status_event[dev->type].event; ev->flags &= ~SNDRV_SEQ_EVENT_LENGTH_MASK; ev->flags |= SNDRV_SEQ_EVENT_LENGTH_FIXED; if (status_event[dev->type].encode) /* set data values */ status_event[dev->type].encode(dev, ev); if (dev->type >= ST_SPECIAL) dev->type = ST_INVALID; rc = true; } else if (dev->type == ST_SYSEX) { if (c == MIDI_CMD_COMMON_SYSEX_END || dev->read >= dev->bufsize) { ev->flags &= ~SNDRV_SEQ_EVENT_LENGTH_MASK; ev->flags |= SNDRV_SEQ_EVENT_LENGTH_VARIABLE; ev->type = SNDRV_SEQ_EVENT_SYSEX; ev->data.ext.len = dev->read; ev->data.ext.ptr = dev->buf; if (c != MIDI_CMD_COMMON_SYSEX_END) dev->read = 0; /* continue to parse */ else reset_encode(dev); /* all parsed */ rc = true; } } return rc; } EXPORT_SYMBOL(snd_midi_event_encode_byte); /* encode note event */ static void note_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.note.channel = dev->buf[0] & 0x0f; ev->data.note.note = dev->buf[1]; ev->data.note.velocity = dev->buf[2]; } /* encode one parameter controls */ static void one_param_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.control.channel = dev->buf[0] & 0x0f; ev->data.control.value = dev->buf[1]; } /* encode pitch wheel change */ static void pitchbend_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.control.channel = dev->buf[0] & 0x0f; ev->data.control.value = (int)dev->buf[2] * 128 + (int)dev->buf[1] - 8192; } /* encode midi control change */ static void two_param_ctrl_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.control.channel = dev->buf[0] & 0x0f; ev->data.control.param = dev->buf[1]; ev->data.control.value = dev->buf[2]; } /* encode one parameter value*/ static void one_param_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.control.value = dev->buf[1]; } /* encode song position */ static void songpos_event(struct snd_midi_event *dev, struct snd_seq_event *ev) { ev->data.control.value = (int)dev->buf[2] * 128 + (int)dev->buf[1]; } /* * decode from a sequencer event to midi bytes * return the size of decoded midi events */ long snd_midi_event_decode(struct snd_midi_event *dev, unsigned char *buf, long count, struct snd_seq_event *ev) { unsigned int cmd, type; if (ev->type == SNDRV_SEQ_EVENT_NONE) return -ENOENT; for (type = 0; type < ARRAY_SIZE(status_event); type++) { if (ev->type == status_event[type].event) goto __found; } for (type = 0; type < ARRAY_SIZE(extra_event); type++) { if (ev->type == extra_event[type].event) return extra_event[type].decode(dev, buf, count, ev); } return -ENOENT; __found: if (type >= ST_SPECIAL) cmd = 0xf0 + (type - ST_SPECIAL); else /* data.note.channel and data.control.channel is identical */ cmd = 0x80 | (type << 4) | (ev->data.note.channel & 0x0f); if (cmd == MIDI_CMD_COMMON_SYSEX) { snd_midi_event_reset_decode(dev); return snd_seq_expand_var_event(ev, count, buf, 1, 0); } else { int qlen; unsigned char xbuf[4]; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if ((cmd & 0xf0) == 0xf0 || dev->lastcmd != cmd || dev->nostat) { dev->lastcmd = cmd; spin_unlock_irqrestore(&dev->lock, flags); xbuf[0] = cmd; if (status_event[type].decode) status_event[type].decode(ev, xbuf + 1); qlen = status_event[type].qlen + 1; } else { spin_unlock_irqrestore(&dev->lock, flags); if (status_event[type].decode) status_event[type].decode(ev, xbuf + 0); qlen = status_event[type].qlen; } if (count < qlen) return -ENOMEM; memcpy(buf, xbuf, qlen); return qlen; } } EXPORT_SYMBOL(snd_midi_event_decode); /* decode note event */ static void note_decode(struct snd_seq_event *ev, unsigned char *buf) { buf[0] = ev->data.note.note & 0x7f; buf[1] = ev->data.note.velocity & 0x7f; } /* decode one parameter controls */ static void one_param_decode(struct snd_seq_event *ev, unsigned char *buf) { buf[0] = ev->data.control.value & 0x7f; } /* decode pitch wheel change */ static void pitchbend_decode(struct snd_seq_event *ev, unsigned char *buf) { int value = ev->data.control.value + 8192; buf[0] = value & 0x7f; buf[1] = (value >> 7) & 0x7f; } /* decode midi control change */ static void two_param_decode(struct snd_seq_event *ev, unsigned char *buf) { buf[0] = ev->data.control.param & 0x7f; buf[1] = ev->data.control.value & 0x7f; } /* decode song position */ static void songpos_decode(struct snd_seq_event *ev, unsigned char *buf) { buf[0] = ev->data.control.value & 0x7f; buf[1] = (ev->data.control.value >> 7) & 0x7f; } /* decode 14bit control */ static int extra_decode_ctrl14(struct snd_midi_event *dev, unsigned char *buf, int count, struct snd_seq_event *ev) { unsigned char cmd; int idx = 0; cmd = MIDI_CMD_CONTROL|(ev->data.control.channel & 0x0f); if (ev->data.control.param < 0x20) { if (count < 4) return -ENOMEM; if (dev->nostat && count < 6) return -ENOMEM; if (cmd != dev->lastcmd || dev->nostat) { if (count < 5) return -ENOMEM; buf[idx++] = dev->lastcmd = cmd; } buf[idx++] = ev->data.control.param; buf[idx++] = (ev->data.control.value >> 7) & 0x7f; if (dev->nostat) buf[idx++] = cmd; buf[idx++] = ev->data.control.param + 0x20; buf[idx++] = ev->data.control.value & 0x7f; } else { if (count < 2) return -ENOMEM; if (cmd != dev->lastcmd || dev->nostat) { if (count < 3) return -ENOMEM; buf[idx++] = dev->lastcmd = cmd; } buf[idx++] = ev->data.control.param & 0x7f; buf[idx++] = ev->data.control.value & 0x7f; } return idx; } /* decode reg/nonreg param */ static int extra_decode_xrpn(struct snd_midi_event *dev, unsigned char *buf, int count, struct snd_seq_event *ev) { unsigned char cmd; const char *cbytes; static const char cbytes_nrpn[4] = { MIDI_CTL_NONREG_PARM_NUM_MSB, MIDI_CTL_NONREG_PARM_NUM_LSB, MIDI_CTL_MSB_DATA_ENTRY, MIDI_CTL_LSB_DATA_ENTRY }; static const char cbytes_rpn[4] = { MIDI_CTL_REGIST_PARM_NUM_MSB, MIDI_CTL_REGIST_PARM_NUM_LSB, MIDI_CTL_MSB_DATA_ENTRY, MIDI_CTL_LSB_DATA_ENTRY }; unsigned char bytes[4]; int idx = 0, i; if (count < 8) return -ENOMEM; if (dev->nostat && count < 12) return -ENOMEM; cmd = MIDI_CMD_CONTROL|(ev->data.control.channel & 0x0f); bytes[0] = (ev->data.control.param & 0x3f80) >> 7; bytes[1] = ev->data.control.param & 0x007f; bytes[2] = (ev->data.control.value & 0x3f80) >> 7; bytes[3] = ev->data.control.value & 0x007f; if (cmd != dev->lastcmd && !dev->nostat) { if (count < 9) return -ENOMEM; buf[idx++] = dev->lastcmd = cmd; } cbytes = ev->type == SNDRV_SEQ_EVENT_NONREGPARAM ? cbytes_nrpn : cbytes_rpn; for (i = 0; i < 4; i++) { if (dev->nostat) buf[idx++] = dev->lastcmd = cmd; buf[idx++] = cbytes[i]; buf[idx++] = bytes[i]; } return idx; }
1 1 1 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H #include <linux/io_uring_types.h> /* * Don't allow the cache to grow beyond this size. */ #define IO_ALLOC_CACHE_MAX 128 void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(const void *)); bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, unsigned int size, unsigned int init_bytes); void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { if (cache->nr_cached < cache->max_cached) { if (!kasan_mempool_poison_object(entry)) return false; cache->entries[cache->nr_cached++] = entry; return true; } return false; } static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { if (cache->nr_cached) { void *entry = cache->entries[--cache->nr_cached]; /* * If KASAN is enabled, always clear the initial bytes that * must be zeroed post alloc, in case any of them overlap * with KASAN storage. */ #if defined(CONFIG_KASAN) kasan_mempool_unpoison_object(entry, cache->elem_size); if (cache->init_clear) memset(entry, 0, cache->init_clear); #endif return entry; } return NULL; } static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) { void *obj; obj = io_alloc_cache_get(cache); if (obj) return obj; return io_cache_alloc_new(cache, gfp); } static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) { if (!io_alloc_cache_put(cache, obj)) kfree(obj); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> struct configfs_fragment { atomic_t frag_count; struct rw_semaphore frag_sem; bool frag_dead; }; void put_fragment(struct configfs_fragment *); struct configfs_fragment *get_fragment(struct configfs_fragment *); struct configfs_dirent { atomic_t s_count; int s_dependent_count; struct list_head s_sibling; struct list_head s_children; int s_links; void * s_element; int s_type; umode_t s_mode; struct dentry * s_dentry; struct iattr * s_iattr; #ifdef CONFIG_LOCKDEP int s_depth; #endif struct configfs_fragment *s_frag; }; #define CONFIGFS_ROOT 0x0001 #define CONFIGFS_DIR 0x0002 #define CONFIGFS_ITEM_ATTR 0x0004 #define CONFIGFS_ITEM_BIN_ATTR 0x0008 #define CONFIGFS_ITEM_LINK 0x0020 #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) #define CONFIGFS_PINNED \ (CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); extern struct inode *configfs_create(struct dentry *, umode_t mode); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_create_bin_file(struct config_item *, const struct configfs_bin_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); extern void configfs_release_fs(void); extern const struct file_operations configfs_dir_operations; extern const struct file_operations configfs_file_operations; extern const struct file_operations configfs_bin_file_operations; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, struct dentry *dentry, char *body); static inline struct config_item * to_item(struct dentry * dentry) { struct configfs_dirent * sd = dentry->d_fsdata; return ((struct config_item *) sd->s_element); } static inline struct configfs_attribute * to_attr(struct dentry * dentry) { struct configfs_dirent * sd = dentry->d_fsdata; return ((struct configfs_attribute *) sd->s_element); } static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry) { struct configfs_attribute *attr = to_attr(dentry); return container_of(attr, struct configfs_bin_attribute, cb_attr); } static inline struct config_item *configfs_get_config_item(struct dentry *dentry) { struct config_item * item = NULL; spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; item = config_item_get(sd->s_element); } spin_unlock(&dentry->d_lock); return item; } static inline void release_configfs_dirent(struct configfs_dirent * sd) { if (!(sd->s_type & CONFIGFS_ROOT)) { kfree(sd->s_iattr); put_fragment(sd->s_frag); kmem_cache_free(configfs_dir_cachep, sd); } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) { if (sd) { WARN_ON(!atomic_read(&sd->s_count)); atomic_inc(&sd->s_count); } return sd; } static inline void configfs_put(struct configfs_dirent * sd) { WARN_ON(!atomic_read(&sd->s_count)); if (atomic_dec_and_test(&sd->s_count)) release_configfs_dirent(sd); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/signalfd.h * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H #include <uapi/linux/signalfd.h> #include <linux/sched/signal.h> #ifdef CONFIG_SIGNALFD /* * Deliver the signal to listening signalfd. */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh))) wake_up(&tsk->sighand->signalfd_wqh); } extern void signalfd_cleanup(struct sighand_struct *sighand); #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } static inline void signalfd_cleanup(struct sighand_struct *sighand) { } #endif /* CONFIG_SIGNALFD */ #endif /* _LINUX_SIGNALFD_H */
540 536 2 2 2 1 3 3 1 1 1 1 1 1 1 4 3 4 1 3 3 3 4 3 2 4 2 4 4 4 4 4 12 11 8 3 1 2 2 1 1 2 1 1 4 11 11 12 4 4 11 11 12 12 6 5 4 3 3 3 4 3 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/unwind_deferred.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { struct pid *pid = task_pid(p); nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); wake_up_all(&pid->wait_pidfd); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); pidfs_exit(p); cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk, struct core_state *core_state) { struct core_thread self; self.task = tsk; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE #ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n--; } while (!*n); return (unsigned long)end_of_stack(p) - (unsigned long)n; } #else /* !CONFIG_STACK_GROWSUP */ unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n++; } while (!*n); return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} #endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ tsk->flags |= PF_POSTCOREDUMP; core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); if (unlikely(core_state)) coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); /* * Since sampling can touch ->mm, make sure to stop everything before we * tear it down. * * Also flushes inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); /* * PF_EXITING (above) ensures unwind_deferred_request() will no * longer add new unwinds. While exit_mm() (below) will destroy the * abaility to do unwinds. So flush any pending unwinds here. */ unwind_deferred_task_exit(tsk); exit_mm(); if (group_dead) acct_process(); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/mii.h: definitions for MII-compatible transceivers * Originally drivers/net/sunhme.h. * * Copyright (C) 1996, 1999, 2001 David S. Miller (davem@redhat.com) */ #ifndef __LINUX_MII_H__ #define __LINUX_MII_H__ #include <linux/if.h> #include <linux/linkmode.h> #include <uapi/linux/mii.h> struct ethtool_cmd; struct mii_if_info { int phy_id; int advertising; int phy_id_mask; int reg_num_mask; unsigned int full_duplex : 1; /* is full duplex? */ unsigned int force_media : 1; /* is autoneg. disabled? */ unsigned int supports_gmii : 1; /* are GMII registers supported? */ struct net_device *dev; int (*mdio_read) (struct net_device *dev, int phy_id, int location); void (*mdio_write) (struct net_device *dev, int phy_id, int location, int val); }; extern int mii_link_ok (struct mii_if_info *mii); extern int mii_nway_restart (struct mii_if_info *mii); extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern void mii_ethtool_get_link_ksettings( struct mii_if_info *mii, struct ethtool_link_ksettings *cmd); extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern int mii_ethtool_set_link_ksettings( struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd); extern int mii_check_gmii_support(struct mii_if_info *mii); extern void mii_check_link (struct mii_if_info *mii); extern unsigned int mii_check_media (struct mii_if_info *mii, unsigned int ok_to_print, unsigned int init_media); extern int generic_mii_ioctl(struct mii_if_info *mii_if, struct mii_ioctl_data *mii_data, int cmd, unsigned int *duplex_changed); static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) { return (struct mii_ioctl_data *) &rq->ifr_ifru; } /** * mii_nway_result * @negotiated: value of MII ANAR and'd with ANLPAR * * Given a set of MII abilities, check each bit and returns the * currently supported media, in the priority order defined by * IEEE 802.3u. We use LPA_xxx constants but note this is not the * value of LPA solely, as described above. * * The one exception to IEEE 802.3u is that 100baseT4 is placed * between 100T-full and 100T-half. If your phy does not support * 100T4 this is fine. If your phy places 100T4 elsewhere in the * priority order, you will need to roll your own function. */ static inline unsigned int mii_nway_result (unsigned int negotiated) { unsigned int ret; if (negotiated & LPA_100FULL) ret = LPA_100FULL; else if (negotiated & LPA_100BASE4) ret = LPA_100BASE4; else if (negotiated & LPA_100HALF) ret = LPA_100HALF; else if (negotiated & LPA_10FULL) ret = LPA_10FULL; else ret = LPA_10HALF; return ret; } /** * mii_duplex * @duplex_lock: Non-zero if duplex is locked at full * @negotiated: value of MII ANAR and'd with ANLPAR * * A small helper function for a common case. Returns one * if the media is operating or locked at full duplex, and * returns zero otherwise. */ static inline unsigned int mii_duplex (unsigned int duplex_lock, unsigned int negotiated) { if (duplex_lock) return 1; if (mii_nway_result(negotiated) & LPA_DUPLEX) return 1; return 0; } /** * ethtool_adv_to_mii_adv_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_10baseT_Half) result |= ADVERTISE_10HALF; if (ethadv & ADVERTISED_10baseT_Full) result |= ADVERTISE_10FULL; if (ethadv & ADVERTISED_100baseT_Half) result |= ADVERTISE_100HALF; if (ethadv & ADVERTISED_100baseT_Full) result |= ADVERTISE_100FULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_PAUSE_CAP; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * linkmode_adv_to_mii_adv_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 linkmode_adv_to_mii_adv_t(const unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising)) result |= ADVERTISE_10HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising)) result |= ADVERTISE_10FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising)) result |= ADVERTISE_100HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising)) result |= ADVERTISE_100FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_t * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to ethtool advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_10HALF) result |= ADVERTISED_10baseT_Half; if (adv & ADVERTISE_10FULL) result |= ADVERTISED_10baseT_Full; if (adv & ADVERTISE_100HALF) result |= ADVERTISED_100baseT_Half; if (adv & ADVERTISE_100FULL) result |= ADVERTISED_100baseT_Full; if (adv & ADVERTISE_PAUSE_CAP) result |= ADVERTISED_Pause; if (adv & ADVERTISE_PAUSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * ethtool_adv_to_mii_ctrl1000_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000HALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000FULL; return result; } /** * linkmode_adv_to_mii_ctrl1000_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 linkmode_adv_to_mii_ctrl1000_t(const unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising)) result |= ADVERTISE_1000HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising)) result |= ADVERTISE_1000FULL; return result; } /** * mii_ctrl1000_to_ethtool_adv_t * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_ctrl1000_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000HALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_lpa_to_ethtool_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA * bits, when in 1000Base-T mode, to ethtool * LP advertisement settings. */ static inline u32 mii_lpa_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_LPACK) result |= ADVERTISED_Autoneg; return result | mii_adv_to_ethtool_adv_t(lpa); } /** * mii_stat1000_to_ethtool_lpa_t * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_stat1000_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_1000HALF) result |= ADVERTISED_1000baseT_Half; if (lpa & LPA_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_stat1000_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 bits, when in * 1000Base-T mode, to linkmode advertisement settings. Other bits in * advertising are not changes. */ static inline void mii_stat1000_mod_linkmode_lpa_t(unsigned long *advertising, u32 lpa) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, lpa & LPA_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, lpa & LPA_1000FULL); } /** * ethtool_adv_to_mii_adv_x * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000Base-X mode. */ static inline u32 ethtool_adv_to_mii_adv_x(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000XHALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000XFULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_1000XPAUSE; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_1000XPSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_x * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-X mode, to ethtool * advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000XHALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000XFULL) result |= ADVERTISED_1000baseT_Full; if (adv & ADVERTISE_1000XPAUSE) result |= ADVERTISED_Pause; if (adv & ADVERTISE_1000XPSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits to * linkmode advertisement settings. Leaves other bits unchanged. */ static inline void mii_adv_mod_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising, adv & ADVERTISE_10HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising, adv & ADVERTISE_10FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising, adv & ADVERTISE_100HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising, adv & ADVERTISE_100FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_CAP); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_ASYM); } /** * mii_adv_to_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to linkmode advertisement settings. Clears the old value * of advertising. */ static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_zero(advertising); mii_adv_mod_linkmode_adv_t(advertising, adv); } /** * mii_lpa_to_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Clears the * old value of advertising */ static inline void mii_lpa_to_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_to_linkmode_adv_t(lp_advertising, lpa); if (lpa & LPA_LPACK) linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising); } /** * mii_lpa_mod_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Leaves * other bits unchanged. */ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_mod_linkmode_adv_t(lp_advertising, lpa); linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising, lpa & LPA_LPACK); } static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, u32 ctrl1000) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, ctrl1000 & ADVERTISE_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, ctrl1000 & ADVERTISE_1000FULL); } /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising * * A small helper function that translates linkmode advertising to LVL * pause capabilities. */ static inline u32 linkmode_adv_to_lcl_adv_t(const unsigned long *advertising) { u32 lcl_adv = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; return lcl_adv; } /** * mii_lpa_mod_linkmode_x - decode the link partner's config_reg to linkmodes * @linkmodes: link modes array * @lpa: config_reg word from link partner * @fd_bit: link mode for 1000XFULL bit */ static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, int fd_bit) { linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, linkmodes, lpa & LPA_LPACK); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE_ASYM); linkmode_mod_bit(fd_bit, linkmodes, lpa & LPA_1000XFULL); } /** * linkmode_adv_to_mii_adv_x - encode a linkmode to config_reg * @linkmodes: linkmodes * @fd_bit: full duplex bit */ static inline u16 linkmode_adv_to_mii_adv_x(const unsigned long *linkmodes, int fd_bit) { u16 adv = 0; if (linkmode_test_bit(fd_bit, linkmodes)) adv |= ADVERTISE_1000XFULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPAUSE; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPSE_ASYM; return adv; } /** * mii_advertise_flowctrl - get flow control advertisement flags * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) */ static inline u16 mii_advertise_flowctrl(int cap) { u16 adv = 0; if (cap & FLOW_CTRL_RX) adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; if (cap & FLOW_CTRL_TX) adv ^= ADVERTISE_PAUSE_ASYM; return adv; } /** * mii_resolve_flowctrl_fdx * @lcladv: value of MII ADVERTISE register * @rmtadv: value of MII LPA register * * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3 */ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) { u8 cap = 0; if (lcladv & rmtadv & ADVERTISE_PAUSE_CAP) { cap = FLOW_CTRL_TX | FLOW_CTRL_RX; } else if (lcladv & rmtadv & ADVERTISE_PAUSE_ASYM) { if (lcladv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_RX; else if (rmtadv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_TX; } return cap; } /** * mii_bmcr_encode_fixed - encode fixed speed/duplex settings to a BMCR value * @speed: a SPEED_* value * @duplex: a DUPLEX_* value * * Encode the speed and duplex to a BMCR value. 2500, 1000, 100 and 10 Mbps are * supported. 2500Mbps is encoded to 1000Mbps. Other speeds are encoded as 10 * Mbps. Unknown duplex values are encoded to half-duplex. */ static inline u16 mii_bmcr_encode_fixed(int speed, int duplex) { u16 bmcr; switch (speed) { case SPEED_2500: case SPEED_1000: bmcr = BMCR_SPEED1000; break; case SPEED_100: bmcr = BMCR_SPEED100; break; case SPEED_10: default: bmcr = BMCR_SPEED10; break; } if (duplex == DUPLEX_FULL) bmcr |= BMCR_FULLDPLX; return bmcr; } #endif /* __LINUX_MII_H__ */
2 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "cookie.h" #include "peer.h" #include "device.h" #include "messages.h" #include "ratelimiter.h" #include "timers.h" #include <crypto/blake2s.h> #include <crypto/chacha20poly1305.h> #include <crypto/utils.h> #include <net/ipv6.h> void wg_cookie_checker_init(struct cookie_checker *checker, struct wg_device *wg) { init_rwsem(&checker->secret_lock); checker->secret_birthdate = ktime_get_coarse_boottime_ns(); get_random_bytes(checker->secret, NOISE_HASH_LEN); checker->device = wg; } enum { COOKIE_KEY_LABEL_LEN = 8 }; static const u8 mac1_key_label[COOKIE_KEY_LABEL_LEN] __nonstring = "mac1----"; static const u8 cookie_key_label[COOKIE_KEY_LABEL_LEN] __nonstring = "cookie--"; static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 label[COOKIE_KEY_LABEL_LEN]) { struct blake2s_ctx blake; blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN); blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN); blake2s_update(&blake, pubkey, NOISE_PUBLIC_KEY_LEN); blake2s_final(&blake, key); } /* Must hold peer->handshake.static_identity->lock */ void wg_cookie_checker_precompute_device_keys(struct cookie_checker *checker) { if (likely(checker->device->static_identity.has_identity)) { precompute_key(checker->cookie_encryption_key, checker->device->static_identity.static_public, cookie_key_label); precompute_key(checker->message_mac1_key, checker->device->static_identity.static_public, mac1_key_label); } else { memset(checker->cookie_encryption_key, 0, NOISE_SYMMETRIC_KEY_LEN); memset(checker->message_mac1_key, 0, NOISE_SYMMETRIC_KEY_LEN); } } void wg_cookie_checker_precompute_peer_keys(struct wg_peer *peer) { precompute_key(peer->latest_cookie.cookie_decryption_key, peer->handshake.remote_static, cookie_key_label); precompute_key(peer->latest_cookie.message_mac1_key, peer->handshake.remote_static, mac1_key_label); } void wg_cookie_init(struct cookie *cookie) { memset(cookie, 0, sizeof(*cookie)); init_rwsem(&cookie->lock); } static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, const u8 key[NOISE_SYMMETRIC_KEY_LEN]) { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac1); blake2s(key, NOISE_SYMMETRIC_KEY_LEN, message, len, mac1, COOKIE_LEN); } static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, const u8 cookie[COOKIE_LEN]) { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac2); blake2s(cookie, COOKIE_LEN, message, len, mac2, COOKIE_LEN); } static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, struct cookie_checker *checker) { struct blake2s_ctx blake; if (wg_birthdate_has_expired(checker->secret_birthdate, COOKIE_SECRET_MAX_AGE)) { down_write(&checker->secret_lock); checker->secret_birthdate = ktime_get_coarse_boottime_ns(); get_random_bytes(checker->secret, NOISE_HASH_LEN); up_write(&checker->secret_lock); } down_read(&checker->secret_lock); blake2s_init_key(&blake, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); if (skb->protocol == htons(ETH_P_IP)) blake2s_update(&blake, (u8 *)&ip_hdr(skb)->saddr, sizeof(struct in_addr)); else if (skb->protocol == htons(ETH_P_IPV6)) blake2s_update(&blake, (u8 *)&ipv6_hdr(skb)->saddr, sizeof(struct in6_addr)); blake2s_update(&blake, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); blake2s_final(&blake, cookie); up_read(&checker->secret_lock); } enum cookie_mac_state wg_cookie_validate_packet(struct cookie_checker *checker, struct sk_buff *skb, bool check_cookie) { struct message_macs *macs = (struct message_macs *) (skb->data + skb->len - sizeof(*macs)); enum cookie_mac_state ret; u8 computed_mac[COOKIE_LEN]; u8 cookie[COOKIE_LEN]; ret = INVALID_MAC; compute_mac1(computed_mac, skb->data, skb->len, checker->message_mac1_key); if (crypto_memneq(computed_mac, macs->mac1, COOKIE_LEN)) goto out; ret = VALID_MAC_BUT_NO_COOKIE; if (!check_cookie) goto out; make_cookie(cookie, skb, checker); compute_mac2(computed_mac, skb->data, skb->len, cookie); if (crypto_memneq(computed_mac, macs->mac2, COOKIE_LEN)) goto out; ret = VALID_MAC_WITH_COOKIE_BUT_RATELIMITED; if (!wg_ratelimiter_allow(skb, dev_net(checker->device->dev))) goto out; ret = VALID_MAC_WITH_COOKIE; out: return ret; } void wg_cookie_add_mac_to_packet(void *message, size_t len, struct wg_peer *peer) { struct message_macs *macs = (struct message_macs *) ((u8 *)message + len - sizeof(*macs)); down_write(&peer->latest_cookie.lock); compute_mac1(macs->mac1, message, len, peer->latest_cookie.message_mac1_key); memcpy(peer->latest_cookie.last_mac1_sent, macs->mac1, COOKIE_LEN); peer->latest_cookie.have_sent_mac1 = true; up_write(&peer->latest_cookie.lock); down_read(&peer->latest_cookie.lock); if (peer->latest_cookie.is_valid && !wg_birthdate_has_expired(peer->latest_cookie.birthdate, COOKIE_SECRET_MAX_AGE - COOKIE_SECRET_LATENCY)) compute_mac2(macs->mac2, message, len, peer->latest_cookie.cookie); else memset(macs->mac2, 0, COOKIE_LEN); up_read(&peer->latest_cookie.lock); } void wg_cookie_message_create(struct message_handshake_cookie *dst, struct sk_buff *skb, __le32 index, struct cookie_checker *checker) { struct message_macs *macs = (struct message_macs *) ((u8 *)skb->data + skb->len - sizeof(*macs)); u8 cookie[COOKIE_LEN]; dst->header.type = cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE); dst->receiver_index = index; get_random_bytes_wait(dst->nonce, COOKIE_NONCE_LEN); make_cookie(cookie, skb, checker); xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN, macs->mac1, COOKIE_LEN, dst->nonce, checker->cookie_encryption_key); } void wg_cookie_message_consume(struct message_handshake_cookie *src, struct wg_device *wg) { struct wg_peer *peer = NULL; u8 cookie[COOKIE_LEN]; bool ret; if (unlikely(!wg_index_hashtable_lookup(wg->index_hashtable, INDEX_HASHTABLE_HANDSHAKE | INDEX_HASHTABLE_KEYPAIR, src->receiver_index, &peer))) return; down_read(&peer->latest_cookie.lock); if (unlikely(!peer->latest_cookie.have_sent_mac1)) { up_read(&peer->latest_cookie.lock); goto out; } ret = xchacha20poly1305_decrypt( cookie, src->encrypted_cookie, sizeof(src->encrypted_cookie), peer->latest_cookie.last_mac1_sent, COOKIE_LEN, src->nonce, peer->latest_cookie.cookie_decryption_key); up_read(&peer->latest_cookie.lock); if (ret) { down_write(&peer->latest_cookie.lock); memcpy(peer->latest_cookie.cookie, cookie, COOKIE_LEN); peer->latest_cookie.birthdate = ktime_get_coarse_boottime_ns(); peer->latest_cookie.is_valid = true; peer->latest_cookie.have_sent_mac1 = false; up_write(&peer->latest_cookie.lock); } else { net_dbg_ratelimited("%s: Could not decrypt invalid cookie response\n", wg->dev->name); } out: wg_peer_put(peer); }
5 5 5 4 3 3 3 2 1 1 1 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } return ret; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); bool nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers, const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; bool found = false; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; data->genre = f->genre; data->version = f->version; found = true; break; } return found; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
2 1 5 5 5 1 5 1 1 1 4 1 5 5 4 1 1 5 4 1 1 1 1 5 5 5 1 6 6 6 5 5 5 1 5 5 5 5 5 6 5 5 5 5 5 6 4 4 2 4 4 4 4 4 1 4 4 4 4 4 1 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 1 1 1 1 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 2 2 3 3 2 3 1 3 5 4 5 1 5 4 4 4 4 4 1 3 1 1 1 2 1 1 4 4 4 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 6 6 6 5 5 4 4 4 4 4 4 4 4 5 2 2 2 2 2 2 2 2 2 5 6 6 10 10 10 10 1 1 1 1 1 1 1 3 5 7 7 6 7 5 5 5 2 2 2 2 2 3 4 2 2 2 2 2 2 5 5 5 5 5 5 5 3 1 1 1 1 1 1 1 4 4 1 1 4 3 3 3 4 5 5 4 3 2 4 2 2 13 12 11 11 11 11 11 10 3 8 1 8 2 2 2 2 2 2 2 14 13 14 8 2 8 7 7 7 7 7 2 2 7 2 7 2 2 2 7 6 8 6 8 8 8 15 15 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> #include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> #include <linux/leafops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> #include <linux/minmax.h> #include <linux/overflow.h> #include <linux/buildid.h> #include <asm/elf.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" #define SENTINEL_VMA_END -1 #define SENTINEL_VMA_GATE -2 #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter_sum(mm, MM_ANONPAGES); file = get_mm_counter_sum(mm, MM_FILEPAGES); shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any * collector of these hiwater stats must therefore get total_vm * and rss too, which will usually be the higher. Barriers? not * worth the effort, such snapshots can always be inconsistent. */ hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; /* split executable areas between text and lib */ text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); SEQ_PUT_DEC(" kB\nRssFile:\t", file); SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); seq_put_decimal_ull_width(m, " kB\nVmExe:\t", text >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmLib:\t", lib >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } #undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { return PAGE_SIZE * mm->total_vm; } unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } #ifdef CONFIG_NUMA /* * Save get_task_policy() for show_numa_map(). */ static void hold_task_mempolicy(struct proc_maps_private *priv) { struct task_struct *task = priv->task; task_lock(task); priv->task_mempolicy = get_task_policy(task); mpol_get(priv->task_mempolicy); task_unlock(task); } static void release_task_mempolicy(struct proc_maps_private *priv) { mpol_put(priv->task_mempolicy); } #else static void hold_task_mempolicy(struct proc_maps_private *priv) { } static void release_task_mempolicy(struct proc_maps_private *priv) { } #endif #ifdef CONFIG_PER_VMA_LOCK static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) { lock_ctx->locked_vma = NULL; lock_ctx->mmap_locked = false; } static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->locked_vma) { vma_end_read(lock_ctx->locked_vma); lock_ctx->locked_vma = NULL; } } static const struct seq_operations proc_pid_maps_op; static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { /* * smaps and numa_maps perform page table walk, therefore require * mmap_lock but maps can be read with locking just the vma and * walking the vma tree under rcu read protection. */ if (m->op != &proc_pid_maps_op) { if (mmap_read_lock_killable(lock_ctx->mm)) return false; lock_ctx->mmap_locked = true; } else { rcu_read_lock(); reset_lock_ctx(lock_ctx); } return true; } static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->mmap_locked) { mmap_read_unlock(lock_ctx->mm); } else { unlock_ctx_vma(lock_ctx); rcu_read_unlock(); } } static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, loff_t last_pos) { struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; struct vm_area_struct *vma; if (lock_ctx->mmap_locked) return vma_next(&priv->iter); unlock_ctx_vma(lock_ctx); vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos); if (!IS_ERR_OR_NULL(vma)) lock_ctx->locked_vma = vma; return vma; } static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, loff_t pos) { struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; if (lock_ctx->mmap_locked) return false; rcu_read_unlock(); mmap_read_lock(lock_ctx->mm); /* Reinitialize the iterator after taking mmap_lock */ vma_iter_set(&priv->iter, pos); lock_ctx->mmap_locked = true; return true; } #else /* CONFIG_PER_VMA_LOCK */ static inline bool lock_vma_range(struct seq_file *m, struct proc_maps_locking_ctx *lock_ctx) { return mmap_read_lock_killable(lock_ctx->mm) == 0; } static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { mmap_read_unlock(lock_ctx->mm); } static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, loff_t last_pos) { return vma_next(&priv->iter); } static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, loff_t pos) { return false; } #endif /* CONFIG_PER_VMA_LOCK */ static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; struct vm_area_struct *vma; retry: vma = get_next_vma(priv, *ppos); /* EINTR of EAGAIN is possible */ if (IS_ERR(vma)) { if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) goto retry; return vma; } /* Store previous position to be able to restart if needed */ priv->last_pos = *ppos; if (vma) { /* * Track the end of the reported vma to ensure position changes * even if previous vma was merged with the next vma and we * found the extended vma with the same vm_start. */ *ppos = vma->vm_end; } else { *ppos = SENTINEL_VMA_GATE; vma = get_gate_vma(priv->lock_ctx.mm); } return vma; } static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; struct proc_maps_locking_ctx *lock_ctx; loff_t last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == SENTINEL_VMA_END) return NULL; priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); lock_ctx = &priv->lock_ctx; mm = lock_ctx->mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; return NULL; } if (!lock_vma_range(m, lock_ctx)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } /* * Reset current position if last_addr was set before * and it's not a sentinel. */ if (last_addr > 0) *ppos = last_addr = priv->last_pos; vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); hold_task_mempolicy(priv); if (last_addr == SENTINEL_VMA_GATE) return get_gate_vma(mm); return proc_get_vma(m, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { if (*ppos == SENTINEL_VMA_GATE) { *ppos = SENTINEL_VMA_END; return NULL; } return proc_get_vma(m, ppos); } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mm_struct *mm = priv->lock_ctx.mm; if (!priv->task) return; release_task_mempolicy(priv); unlock_vma_range(&priv->lock_ctx); mmput(mm); put_task_struct(priv->task); priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops, int psize) { struct proc_maps_private *priv = __seq_open_private(file, ops, psize); if (!priv) return -ENOMEM; priv->inode = inode; priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(priv->lock_ctx.mm)) { int err = PTR_ERR(priv->lock_ctx.mm); seq_release_private(inode, file); return err; } return 0; } static int proc_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->lock_ctx.mm) mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } static int do_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { return proc_maps_open(inode, file, ops, sizeof(struct proc_maps_private)); } static void get_vma_name(struct vm_area_struct *vma, const struct path **path, const char **name, const char **name_fmt) { struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; *name = NULL; *path = NULL; *name_fmt = NULL; /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (vma->vm_file) { /* * If user named this anon shared memory via * prctl(PR_SET_VMA ..., use the provided name. */ if (anon_name) { *name_fmt = "[anon_shmem:%s]"; *name = anon_name->name; } else { *path = file_user_path(vma->vm_file); } return; } if (vma->vm_ops && vma->vm_ops->name) { *name = vma->vm_ops->name(vma); if (*name) return; } *name = arch_vma_name(vma); if (*name) return; if (!vma->vm_mm) { *name = "[vdso]"; return; } if (vma_is_initial_heap(vma)) { *name = "[heap]"; return; } if (vma_is_initial_stack(vma)) { *name = "[stack]"; return; } if (anon_name) { *name_fmt = "[anon:%s]"; *name = anon_name->name; return; } } static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_put_hex_ll(m, NULL, start, 8); seq_put_hex_ll(m, "-", end, 8); seq_putc(m, ' '); seq_putc(m, flags & VM_READ ? 'r' : '-'); seq_putc(m, flags & VM_WRITE ? 'w' : '-'); seq_putc(m, flags & VM_EXEC ? 'x' : '-'); seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); seq_put_hex_ll(m, " ", pgoff, 8); seq_put_hex_ll(m, " ", MAJOR(dev), 2); seq_put_hex_ll(m, ":", MINOR(dev), 2); seq_put_decimal_ull(m, " ", ino); seq_putc(m, ' '); } static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { const struct path *path; const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); get_vma_name(vma, &path, &name, &name_fmt); if (path) { seq_pad(m, ' '); seq_path(m, path, "\n"); } else if (name_fmt) { seq_pad(m, ' '); seq_printf(m, name_fmt, name); } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } seq_putc(m, '\n'); } static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); return 0; } static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; static int pid_maps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_maps_op); } #define PROCMAP_QUERY_VMA_FLAGS ( \ PROCMAP_QUERY_VMA_READABLE | \ PROCMAP_QUERY_VMA_WRITABLE | \ PROCMAP_QUERY_VMA_EXECUTABLE | \ PROCMAP_QUERY_VMA_SHARED \ ) #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ PROCMAP_QUERY_FILE_BACKED_VMA | \ PROCMAP_QUERY_VMA_FLAGS \ ) #ifdef CONFIG_PER_VMA_LOCK static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) { reset_lock_ctx(lock_ctx); return 0; } static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->mmap_locked) { mmap_read_unlock(lock_ctx->mm); lock_ctx->mmap_locked = false; } else { unlock_ctx_vma(lock_ctx); } } static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, unsigned long addr) { struct mm_struct *mm = lock_ctx->mm; struct vm_area_struct *vma; struct vma_iterator vmi; if (lock_ctx->mmap_locked) return find_vma(mm, addr); /* Unlock previously locked VMA and find the next one under RCU */ unlock_ctx_vma(lock_ctx); rcu_read_lock(); vma_iter_init(&vmi, mm, addr); vma = lock_next_vma(mm, &vmi, addr); rcu_read_unlock(); if (!vma) return NULL; if (!IS_ERR(vma)) { lock_ctx->locked_vma = vma; return vma; } if (PTR_ERR(vma) == -EAGAIN) { /* Fallback to mmap_lock on vma->vm_refcnt overflow */ mmap_read_lock(mm); vma = find_vma(mm, addr); lock_ctx->mmap_locked = true; } return vma; } #else /* CONFIG_PER_VMA_LOCK */ static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) { return mmap_read_lock_killable(lock_ctx->mm); } static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { mmap_read_unlock(lock_ctx->mm); } static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, unsigned long addr) { return find_vma(lock_ctx->mm, addr); } #endif /* CONFIG_PER_VMA_LOCK */ static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx, unsigned long addr, u32 flags) { struct vm_area_struct *vma; next_vma: vma = query_vma_find_by_addr(lock_ctx, addr); if (IS_ERR(vma)) return vma; if (!vma) goto no_vma; /* user requested only file-backed VMA, keep iterating */ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) goto skip_vma; /* VMA permissions should satisfy query flags */ if (flags & PROCMAP_QUERY_VMA_FLAGS) { u32 perm = 0; if (flags & PROCMAP_QUERY_VMA_READABLE) perm |= VM_READ; if (flags & PROCMAP_QUERY_VMA_WRITABLE) perm |= VM_WRITE; if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) perm |= VM_EXEC; if (flags & PROCMAP_QUERY_VMA_SHARED) perm |= VM_MAYSHARE; if ((vma->vm_flags & perm) != perm) goto skip_vma; } /* found covering VMA or user is OK with the matching next VMA */ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) return vma; skip_vma: /* * If the user needs closest matching VMA, keep iterating. */ addr = vma->vm_end; if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) goto next_vma; no_vma: return ERR_PTR(-ENOENT); } static int do_procmap_query(struct mm_struct *mm, void __user *uarg) { struct proc_maps_locking_ctx lock_ctx = { .mm = mm }; struct procmap_query karg; struct vm_area_struct *vma; const char *name = NULL; char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; __u64 usize; int err; if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) return -EFAULT; /* argument struct can never be that large, reject abuse */ if (usize > PAGE_SIZE) return -E2BIG; /* argument struct should have at least query_flags and query_addr fields */ if (usize < offsetofend(struct procmap_query, query_addr)) return -EINVAL; err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); if (err) return err; /* reject unknown flags */ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) return -EINVAL; /* either both buffer address and size are set, or both should be zero */ if (!!karg.vma_name_size != !!karg.vma_name_addr) return -EINVAL; if (!!karg.build_id_size != !!karg.build_id_addr) return -EINVAL; if (!mm || !mmget_not_zero(mm)) return -ESRCH; err = query_vma_setup(&lock_ctx); if (err) { mmput(mm); return err; } vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags); if (IS_ERR(vma)) { err = PTR_ERR(vma); vma = NULL; goto out; } karg.vma_start = vma->vm_start; karg.vma_end = vma->vm_end; karg.vma_flags = 0; if (vma->vm_flags & VM_READ) karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; if (vma->vm_flags & VM_WRITE) karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; if (vma->vm_flags & VM_EXEC) karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; if (vma->vm_flags & VM_MAYSHARE) karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; karg.vma_page_size = vma_kernel_pagesize(vma); if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; karg.dev_major = MAJOR(inode->i_sb->s_dev); karg.dev_minor = MINOR(inode->i_sb->s_dev); karg.inode = inode->i_ino; } else { karg.vma_offset = 0; karg.dev_major = 0; karg.dev_minor = 0; karg.inode = 0; } if (karg.build_id_size) { __u32 build_id_sz; err = build_id_parse(vma, build_id_buf, &build_id_sz); if (err) { karg.build_id_size = 0; } else { if (karg.build_id_size < build_id_sz) { err = -ENAMETOOLONG; goto out; } karg.build_id_size = build_id_sz; } } if (karg.vma_name_size) { size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); const struct path *path; const char *name_fmt; size_t name_sz = 0; get_vma_name(vma, &path, &name, &name_fmt); if (path || name_fmt || name) { name_buf = kmalloc(name_buf_sz, GFP_KERNEL); if (!name_buf) { err = -ENOMEM; goto out; } } if (path) { name = d_path(path, name_buf, name_buf_sz); if (IS_ERR(name)) { err = PTR_ERR(name); goto out; } name_sz = name_buf + name_buf_sz - name; } else if (name || name_fmt) { name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); name = name_buf; } if (name_sz > name_buf_sz) { err = -ENAMETOOLONG; goto out; } karg.vma_name_size = name_sz; } /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ query_vma_teardown(&lock_ctx); mmput(mm); if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), name, karg.vma_name_size)) { kfree(name_buf); return -EFAULT; } kfree(name_buf); if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), build_id_buf, karg.build_id_size)) return -EFAULT; if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) return -EFAULT; return 0; out: query_vma_teardown(&lock_ctx); mmput(mm); kfree(name_buf); return err; } static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; switch (cmd) { case PROCMAP_QUERY: /* priv->lock_ctx.mm is set during file open operation */ return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg); default: return -ENOIOCTLCMD; } } const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, .unlocked_ioctl = procfs_procmap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * Proportional Set Size(PSS): my share of RSS. * * PSS of a process is the count of pages it has in memory, where each * page is divided by the number of processes sharing it. So if a * process has 1000 pages all to itself, and 1000 shared with one other * process, its PSS will be 1500. * * To keep (accumulated) division errors low, we adopt a 64bit * fixed-point pss counter to minimize division errors. So (pss >> * PSS_SHIFT) would be the real byte count. * * A shift of 12 before division means (assuming 4K page size): * - 1M 3-user-pages add up to 8KB errors; * - supports mapcount up to 2^24, or 16M; * - supports PSS up to 2^52 bytes, or 4PB. */ #define PSS_SHIFT 12 #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; u64 pss_shmem; u64 pss_dirty; u64 pss_locked; u64 swap_pss; }; static void smaps_page_accumulate(struct mem_size_stats *mss, struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { mss->pss += pss; if (folio_test_anon(folio)) mss->pss_anon += pss; else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; if (locked) mss->pss_locked += pss; if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; else mss->shared_dirty += size; } else { if (private) mss->private_clean += size; else mss->shared_clean += size; } } static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool present) { struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; bool exclusive; int mapcount; /* * First accumulate quantities that depend only on |size| and the type * of the compound page. */ if (folio_test_anon(folio)) { mss->anonymous += size; if (!folio_test_swapbacked(folio) && !dirty && !folio_test_dirty(folio)) mss->lazyfree += size; } if (folio_test_ksm(folio)) mss->ksm += size; mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * * refcount == 1 for present entries guarantees that the folio is mapped * exactly once. For large folios this implies that exactly one * PTE/PMD/... maps (a part of) this folio. * * Treat all non-present entries (where relying on the mapcount and * refcount doesn't make sense) as "maybe shared, but not sure how * often". We treat device private entries as being fake-present. * * Note that it would not be safe to read the mapcount especially for * pages referenced by migration entries, even with the PTL held. */ if (folio_ref_count(folio) == 1 || !present) { smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, dirty, locked, present); return; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { mapcount = folio_average_page_mapcount(folio); exclusive = !folio_maybe_mapped_shared(folio); } /* * We obtain a snapshot of the mapcount. Without holding the folio lock * this snapshot can be slightly wrong as we cannot always read the * mapcount atomically. */ for (i = 0; i < nr; i++, page++) { unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { mapcount = folio_precise_page_mapcount(folio, page); exclusive = mapcount < 2; } if (mapcount >= 2) pss /= mapcount; smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, dirty, locked, exclusive); } } #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, linear_page_index(vma, addr), linear_page_index(vma, end)); return 0; } #else #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) { #ifdef CONFIG_SHMEM if (walk->ops->pte_hole) { /* depth is not used */ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); } #endif } static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false, young = false, dirty = false; pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { page = vm_normal_page(vma, addr, ptent); young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; } else if (pte_none(ptent)) { smaps_pte_hole_lookup(addr, walk); } else { const softleaf_t entry = softleaf_from_pte(ptent); if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; do_div(pss_delta, mapcount); mss->swap_pss += pss_delta; } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } } else if (softleaf_has_pfn(entry)) { if (softleaf_is_device_private(entry)) present = true; page = softleaf_to_page(entry); } } if (!page) return; smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false; struct folio *folio; if (pmd_none(*pmd)) return; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; } else if (unlikely(thp_migration_supported())) { const softleaf_t entry = softleaf_from_pmd(*pmd); if (softleaf_has_pfn(entry)) page = softleaf_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; folio = page_folio(page); if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { } #endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); out: cond_resched(); return 0; } static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. * * The length of the second argument of mnemonics[] * needs to be 3 instead of previously set 2 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) * to avoid spurious * -Werror=unterminated-string-initialization warning * with GCC 15 */ static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. */ [0 ... (BITS_PER_LONG-1)] = "??", [ilog2(VM_READ)] = "rd", [ilog2(VM_WRITE)] = "wr", [ilog2(VM_EXEC)] = "ex", [ilog2(VM_SHARED)] = "sh", [ilog2(VM_MAYREAD)] = "mr", [ilog2(VM_MAYWRITE)] = "mw", [ilog2(VM_MAYEXEC)] = "me", [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_ARM64_BTI [ilog2(VM_ARM64_BTI)] = "bt", #endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif [ilog2(VM_MIXEDMAP)] = "mm", [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", #ifdef CONFIG_ARM64_MTE [ilog2(VM_MTE)] = "mt", [ilog2(VM_MTE_ALLOWED)] = "", #endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", #if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", #endif #if CONFIG_ARCH_PKEY_BITS > 4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) [ilog2(VM_DROPPABLE)] = "dp", #endif #ifdef CONFIG_64BIT [ilog2(VM_SEALED)] = "sl", #endif }; size_t i; seq_puts(m, "VmFlags: "); for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } #ifdef CONFIG_HUGETLB_PAGE static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct folio *folio = NULL; bool present = false; spinlock_t *ptl; pte_t ptent; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); ptent = huge_ptep_get(walk->mm, addr, pte); if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; } else { const softleaf_t entry = softleaf_from_pte(ptent); if (softleaf_has_pfn(entry)) folio = softleaf_to_folio(entry); } if (folio) { /* We treat non-present entries as "maybe shared". */ if (!present || folio_maybe_mapped_shared(folio) || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } spin_unlock(ptl); return 0; } #else #define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, .walk_lock = PGWALK_RDLOCK, }; /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0. */ static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss, unsigned long start) { const struct mm_walk_ops *ops = &smaps_walk_ops; /* Invalid start */ if (start >= vma->vm_end) return; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all * swapped out pages belong to the shmem object, and we can * obtain the swap value much more efficiently. For private * writable mappings, we might have COW pages that are * not affected by the parent swapped out pages of the shmem * object, so we have to distinguish them during the page walk. * Unless we know that the shmem object (or the part mapped by * our VMA) has no swapped out pages at all. */ unsigned long shmem_swapped = shmem_swap_usage(vma); if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { ops = &smaps_shmem_walk_ops; } } /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) /* Show the contents common for smaps and smaps_rollup */ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, bool rollup_mode) { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); if (rollup_mode) { /* * These are meaningful only for smaps_rollup, otherwise two of * them are zero, and the other one is the same as Pss. */ SEQ_PUT_DEC(" kB\nPss_Anon: ", mss->pss_anon >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_File: ", mss->pss_file >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Shmem: ", mss->pss_shmem >> PSS_SHIFT); } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); SEQ_PUT_DEC(" kB\nSwapPss: ", mss->swap_pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nLocked: ", mss->pss_locked >> PSS_SHIFT); seq_puts(m, " kB\n"); } static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); seq_puts(m, " kB\n"); __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); return 0; } static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss = {}; struct mm_struct *mm = priv->lock_ctx.mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; } ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; hold_task_mempolicy(priv); vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; vma_start = vma->vm_start; do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; /* * Release mmap_lock temporarily if someone wants to * access it for write request. */ if (mmap_lock_is_contended(mm)) { vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { release_task_mempolicy(priv); goto out_put_mm; } /* * After dropping the lock, there are four cases to * consider. See the following example for explanation. * * +------+------+-----------+ * | VMA1 | VMA2 | VMA3 | * +------+------+-----------+ * | | | | * 4k 8k 16k 400k * * Suppose we drop the lock after reading VMA2 due to * contention, then we get: * * last_vma_end = 16k * * 1) VMA2 is freed, but VMA3 exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; continue; } /* Case 4 above */ if (vma->vm_end > last_vma_end) { smap_gather_stats(vma, &mss, last_vma_end); last_vma_end = vma->vm_end; } } } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); __show_smap(m, &mss, true); release_task_mempolicy(priv); mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(priv->task); priv->task = NULL; return ret; } #undef SEQ_PUT_DEC static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_smaps_op); } static int smaps_rollup_open(struct inode *inode, struct file *file) { int ret; struct proc_maps_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); if (!priv) return -ENOMEM; ret = single_open(file, show_smaps_rollup, priv); if (ret) goto out_free; priv->inode = inode; priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH; single_release(inode, file); goto out_free; } return 0; out_free: kfree(priv); return ret; } static int smaps_rollup_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->lock_ctx.mm) mmdrop(priv->lock_ctx.mm); kfree(priv); return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; const struct file_operations proc_pid_smaps_rollup_operations = { .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, .release = smaps_rollup_release, }; enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; struct clear_refs_private { enum clear_refs_types type; }; static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; if (!pte_write(pte)) return false; if (!is_cow_mapping(vma->vm_flags)) return false; if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) return false; return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { if (!pgtable_supports_soft_dirty()) return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = ptep_get(pte); if (pte_none(ptent)) return; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) return; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (!pgtable_supports_soft_dirty()) return; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { } #endif static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; } if (!pmd_present(*pmd)) goto out; folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); folio_test_clear_young(folio); folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); folio_test_clear_young(folio); folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; } static int clear_refs_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; if (vma->vm_flags & VM_PFNMAP) return 1; /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (cp->type == CLEAR_REFS_ANON && vma->vm_file) return 1; if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) return 1; return 0; } static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } if (type == CLEAR_REFS_MM_HIWATER_RSS) { /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); goto out_unlock; } if (type == CLEAR_REFS_SOFT_DIRTY) { for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); } out_unlock: mmap_write_unlock(mm); out_mm: mmput(mm); } put_task_struct(task); return count; } const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, .llseek = noop_llseek, }; typedef struct { u64 pme; } pagemap_entry_t; struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) #define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) { if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) return folio_precise_page_mapcount(folio, page) == 1; return !folio_maybe_mapped_shared(folio); } static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; int err = 0; while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; if (vma) hole_end = min(end, vma->vm_start); else hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } if (!vma) break; /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } } out: return err; } static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame = 0, flags = 0; struct page *page = NULL; struct folio *folio; if (pte_none(pte)) goto out; if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else { softleaf_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = softleaf_from_pte(pte); if (pm->show_pfn) { pgoff_t offset; /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ if (softleaf_has_pfn(entry)) offset = softleaf_to_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (softleaf_has_pfn(entry)) page = softleaf_to_page(entry); if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; if (softleaf_is_guard_marker(entry)) flags |= PM_GUARD_REGION; } if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; if ((flags & PM_PRESENT) && __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, unsigned long end, struct vm_area_struct *vma, struct pagemapread *pm) { unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; struct page *page = NULL; struct folio *folio = NULL; int err = 0; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; if (pmd_none(pmd)) goto populate_pagemap; if (pmd_present(pmd)) { page = pmd_page(pmd); flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } else if (thp_migration_supported()) { const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long offset; if (pm->show_pfn) { if (softleaf_has_pfn(entry)) offset = softleaf_to_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); page = softleaf_to_page(entry); } if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; } populate_pagemap: for (; addr != end; addr += PAGE_SIZE, idx++) { u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && __folio_page_mapped_exclusively(folio, page)) cur_flags |= PM_MMAP_EXCLUSIVE; pme = make_pme(frame, cur_flags); err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { if (flags & PM_PRESENT) frame++; else if (flags & PM_SWAP) frame += (1 << MAX_SWAPFILES_SHIFT); } } return err; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm); spin_unlock(ptl); return err; } #endif /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(&pme, pm); if (err) break; } pte_unmap_unlock(orig_pte, ptl); cond_resched(); return err; } #ifdef CONFIG_HUGETLB_PAGE /* This function walks within one hugetlb entry in the single call */ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; spinlock_t *ptl; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); if (!folio_test_anon(folio)) flags |= PM_FILE; if (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) flags |= PM_UFFD_WP; flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); } else if (pte_swp_uffd_wp_any(pte)) { flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } spin_unlock(ptl); cond_resched(); return err; } #else #define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected * Bit 58 pte is a guard region * Bits 59-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present * * If the page is not present but in swap, then the PFN contains an * encoding of the swap file number and the page's offset into the * swap. Unmapped pages return a null PFN. This allows determining * precisely which pages are mapped (or in swap) and comparing mapped * pages between processes. * * Efficient users of this interface will use /proc/pid/maps to * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; struct pagemapread pm; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; if (!mm || !mmget_not_zero(mm)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_mm; ret = 0; if (!count) goto out_mm; /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; end_vaddr = mm->task_size; /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); if (end >= start_vaddr && end < mm->task_size) end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; unsigned long end; pm.pos = 0; end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; } copied += len; buf += len; count -= len; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) ret = copied; out_free: kfree(pm.buffer); out_mm: mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR_OR_NULL(mm)) return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } static int pagemap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { struct pm_scan_arg arg; unsigned long masks_of_interest, cur_vma_category; struct page_region *vec_buf; unsigned long vec_buf_len, vec_buf_index, found_pages; struct page_region __user *vec_out; }; static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long categories; if (pte_none(pte)) return 0; if (pte_present(pte)) { struct page *page; categories = PAGE_IS_PRESENT; if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page(vma, addr, pte); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else { softleaf_t entry; categories = PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; entry = softleaf_from_pte(pte); if (softleaf_is_guard_marker(entry)) categories |= PAGE_IS_GUARD; else if ((p->masks_of_interest & PAGE_IS_FILE) && softleaf_has_pfn(entry) && !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t ptent) { if (pte_present(ptent)) { pte_t old_pte; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (pte_none(ptent)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); } else { ptent = pte_swp_mkuffd_wp(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long categories = PAGE_IS_HUGE; if (pmd_none(pmd)) return categories; if (pmd_present(pmd)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pmd_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page_pmd(vma, addr, pmd); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else { categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (pmd_swp_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { const softleaf_t entry = softleaf_from_pmd(pmd); if (softleaf_has_pfn(entry) && !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; } } return categories; } static void make_uffd_wp_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HUGETLB_PAGE static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; if (pte_none(pte)) return categories; /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for * swapped pages. */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { const unsigned long psize = huge_page_size(hstate_vma(vma)); softleaf_t entry; if (huge_pte_none(ptent)) { set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); return; } entry = softleaf_from_pte(ptent); if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) return; if (softleaf_is_migration(entry)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); else huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); } #endif /* CONFIG_HUGETLB_PAGE */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) static void pagemap_scan_backout_range(struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; if (!p->vec_buf) return; if (cur_buf->start != addr) cur_buf->end = addr; else cur_buf->start = cur_buf->end = 0; p->found_pages -= (end - addr) / PAGE_SIZE; } #endif static bool pagemap_scan_is_interesting_page(unsigned long categories, const struct pagemap_scan_private *p) { categories ^= p->arg.category_inverted; if ((categories & p->arg.category_mask) != p->arg.category_mask) return false; if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) return false; return true; } static bool pagemap_scan_is_interesting_vma(unsigned long categories, const struct pagemap_scan_private *p) { unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; categories ^= p->arg.category_inverted; if ((categories & required) != required) return false; return true; } static int pagemap_scan_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long vma_category = 0; bool wp_allowed = userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma); if (!wp_allowed) { /* User requested explicit failure over wp-async capability */ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) return -EPERM; /* * User requires wr-protect, and allows silently skipping * unsupported vmas. */ if (p->arg.flags & PM_SCAN_WP_MATCHING) return 1; /* * Then the request doesn't involve wr-protects at all, * fall through to the rest checks, and allow vma walk. */ } if (vma->vm_flags & VM_PFNMAP) return 1; if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; if (vma->vm_flags & VM_SOFTDIRTY) vma_category |= PAGE_IS_SOFT_DIRTY; if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; p->cur_vma_category = vma_category; return 0; } static bool pagemap_scan_push_range(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; /* * When there is no output buffer provided at all, the sentinel values * won't match here. There is no other way for `cur_buf->end` to be * non-zero other than it being non-empty. */ if (addr == cur_buf->end && categories == cur_buf->categories) { cur_buf->end = end; return true; } if (cur_buf->end) { if (p->vec_buf_index >= p->vec_buf_len - 1) return false; cur_buf = &p->vec_buf[++p->vec_buf_index]; } cur_buf->start = addr; cur_buf->end = end; cur_buf->categories = categories; return true; } static int pagemap_scan_output(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long *end) { unsigned long n_pages, total_pages; int ret = 0; if (!p->vec_buf) return 0; categories &= p->arg.return_mask; n_pages = (*end - addr) / PAGE_SIZE; if (check_add_overflow(p->found_pages, n_pages, &total_pages) || total_pages > p->arg.max_pages) { size_t n_too_much = total_pages - p->arg.max_pages; *end -= n_too_much * PAGE_SIZE; n_pages -= n_too_much; ret = -ENOSPC; } if (!pagemap_scan_push_range(categories, p, addr, *end)) { *end = addr; n_pages = 0; ret = -ENOSPC; } p->found_pages += n_pages; if (ret) p->arg.walk_end = *end; return ret; } static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return -ENOENT; categories = p->cur_vma_category | pagemap_thp_category(p, vma, start, *pmd); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~p->arg.flags & PM_SCAN_WP_MATCHING) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; /* * Break huge page into small pages if the WP operation * needs to be performed on a portion of the huge page. */ if (end != start + HPAGE_SIZE) { spin_unlock(ptl); split_huge_pmd(vma, pmd, start); pagemap_scan_backout_range(p, start, end); /* Report as if there was no THP */ return -ENOENT; } make_uffd_wp_pmd(vma, start, pmd); flush_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); return ret; #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ return -ENOENT; #endif } static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long addr, flush_end = 0; pte_t *pte, *start_pte; spinlock_t *ptl; int ret; ret = pagemap_scan_thp_entry(pmd, start, end, walk); if (ret != -ENOENT) return ret; ret = 0; start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } arch_enter_lazy_mmu_mode(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = addr + PAGE_SIZE; } goto flush_and_return; } if (!p->arg.category_anyof_mask && !p->arg.category_inverted && p->arg.category_mask == PAGE_IS_WRITTEN && p->arg.return_mask == PAGE_IS_WRITTEN) { for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { unsigned long next = addr + PAGE_SIZE; pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } goto flush_and_return; } for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptent); unsigned long next = addr + PAGE_SIZE; if (!pagemap_scan_is_interesting_page(categories, p)) continue; ret = pagemap_scan_output(categories, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; if (~categories & PAGE_IS_WRITTEN) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); return ret; } #ifdef CONFIG_HUGETLB_PAGE static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; pte_t pte; if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) return 0; return pagemap_scan_output(categories, p, start, &end); } i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; if (end != start + HPAGE_SIZE) { /* Partial HugeTLB page WP isn't possible. */ pagemap_scan_backout_range(p, start, end); p->arg.walk_end = start; ret = 0; goto out_unlock; } make_uffd_wp_huge_pte(vma, start, ptep, pte); flush_hugetlb_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); i_mmap_unlock_write(vma->vm_file->f_mapping); return ret; } #else #define pagemap_scan_hugetlb_entry NULL #endif static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; int ret, err; if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) return 0; ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); if (addr == end) return ret; if (~p->arg.flags & PM_SCAN_WP_MATCHING) return ret; err = uffd_wp_range(vma, addr, end - addr, true); if (err < 0) ret = err; return ret; } static const struct mm_walk_ops pagemap_scan_ops = { .test_walk = pagemap_scan_test_walk, .pmd_entry = pagemap_scan_pmd_entry, .pte_hole = pagemap_scan_pte_hole, .hugetlb_entry = pagemap_scan_hugetlb_entry, }; static int pagemap_scan_get_args(struct pm_scan_arg *arg, unsigned long uarg) { if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) return -EFAULT; if (arg->size != sizeof(struct pm_scan_arg)) return -EINVAL; /* Validate requested features */ if (arg->flags & ~PM_SCAN_FLAGS) return -EINVAL; if ((arg->category_inverted | arg->category_mask | arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) return -EINVAL; arg->start = untagged_addr((unsigned long)arg->start); arg->end = untagged_addr((unsigned long)arg->end); arg->vec = untagged_addr((unsigned long)arg->vec); /* Validate memory pointers */ if (!IS_ALIGNED(arg->start, PAGE_SIZE)) return -EINVAL; if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ arg->end = ALIGN(arg->end, PAGE_SIZE); arg->walk_end = 0; if (!arg->max_pages) arg->max_pages = ULONG_MAX; return 0; } static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, unsigned long uargl) { struct pm_scan_arg __user *uarg = (void __user *)uargl; if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) return -EFAULT; return 0; } static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) { if (!p->arg.vec_len) return 0; p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, p->arg.vec_len); p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), GFP_KERNEL); if (!p->vec_buf) return -ENOMEM; p->vec_buf->start = p->vec_buf->end = 0; p->vec_out = (struct page_region __user *)(long)p->arg.vec; return 0; } static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) { const struct page_region *buf = p->vec_buf; long n = p->vec_buf_index; if (!p->vec_buf) return 0; if (buf[n].end != buf[n].start) n++; if (!n) return 0; if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) return -EFAULT; p->arg.vec_len -= n; p->vec_out += n; p->vec_buf_index = 0; p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); p->vec_buf->start = p->vec_buf->end = 0; return n; } static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; int ret; ret = pagemap_scan_get_args(&p.arg, uarg); if (ret) return ret; p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | p.arg.return_mask; ret = pagemap_scan_init_bounce_buffer(&p); if (ret) return ret; for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { ret = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) break; /* Protection change for the range is going to happen. */ if (p.arg.flags & PM_SCAN_WP_MATCHING) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, walk_start, p.arg.end); mmu_notifier_invalidate_range_start(&range); } ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); if (p.arg.flags & PM_SCAN_WP_MATCHING) mmu_notifier_invalidate_range_end(&range); mmap_read_unlock(mm); n_out = pagemap_scan_flush_buffer(&p); if (n_out < 0) ret = n_out; else n_ranges_out += n_out; if (ret != -ENOSPC) break; if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) break; } /* ENOSPC signifies early stop (buffer full) from the walk. */ if (!ret || ret == -ENOSPC) ret = n_ranges_out; /* The walk_end isn't set when ret is zero */ if (!p.arg.walk_end) p.arg.walk_end = p.arg.end; if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; kfree(p.vec_buf); return ret; } static long do_pagemap_cmd(struct file *file, unsigned int cmd, unsigned long arg) { struct mm_struct *mm = file->private_data; switch (cmd) { case PAGEMAP_SCAN: return do_pagemap_scan(mm, arg); default: return -EINVAL; } } const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA struct numa_maps { unsigned long pages; unsigned long anon; unsigned long active; unsigned long writeback; unsigned long mapcount_max; unsigned long dirty; unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; struct numa_maps_private { struct proc_maps_private proc_maps; struct numa_maps md; }; static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); int count; if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) count = folio_precise_page_mapcount(folio, page); else count = folio_average_page_mapcount(folio); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; if (folio_test_swapcache(folio)) md->swapcache += nr_pages; if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; if (folio_test_writeback(folio)) md->writeback += nr_pages; if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pte_present(pte)) return NULL; page = vm_normal_page(vma, addr, pte); if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static struct page *can_gather_numa_stats_pmd(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pmd_present(pmd)) return NULL; page = vm_normal_page_pmd(vma, addr, pmd); if (!page) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #endif static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct numa_maps *md = walk->private; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { struct page *page; page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; } #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } do { pte_t ptent = ptep_get(pte); struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t huge_pte; struct numa_maps *md; struct page *page; spinlock_t *ptl; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); huge_pte = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(huge_pte)) goto out; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); out: spin_unlock(ptl); return 0; } #else static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, .walk_lock = PGWALK_RDLOCK, }; /* * Display pages allocated per node and memory policy via /proc. */ static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; char buffer[64]; struct mempolicy *pol; pgoff_t ilx; int nid; if (!mm) return 0; /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); } else { mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); } seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { seq_puts(m, " file="); seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); /* mmap_lock is held by m_start */ walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; if (md->anon) seq_printf(m, " anon=%lu", md->anon); if (md->dirty) seq_printf(m, " dirty=%lu", md->dirty); if (md->pages != md->anon && md->pages != md->dirty) seq_printf(m, " mapped=%lu", md->pages); if (md->mapcount_max > 1) seq_printf(m, " mapmax=%lu", md->mapcount_max); if (md->swapcache) seq_printf(m, " swapcache=%lu", md->swapcache); if (md->active < md->pages && !is_vm_hugetlb_page(vma)) seq_printf(m, " active=%lu", md->active); if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); return 0; } static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map, }; static int pid_numa_maps_open(struct inode *inode, struct file *file) { return proc_maps_open(inode, file, &proc_pid_numa_maps_op, sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; #endif /* CONFIG_NUMA */
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
12 12 10 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 20 20 20 20 20 20 5 5 20 19 20 1 20 20 4 5 5 5 4 15 19 20 19 19 15 20 20 14 14 1 14 14 13 14 14 14 14 1 14 13 12 14 14 14 14 14 14 14 14 14 13 14 12 14 14 14 14 14 12 14 14 14 14 14 14 13 14 13 13 14 14 14 14 14 14 14 13 14 20 20 19 20 19 20 20 20 14 14 13 14 14 13 14 14 14 1 13 13 12 13 13 13 5 5 5 12 13 13 13 12 13 12 13 13 18 18 18 17 18 18 18 18 18 16 8 20 20 8 8 8 20 20 3 20 2 5 4 5 5 5 4 5 5 5 4 5 5 5 5 5 5 5 5 5 5 2 2 2 2 2 2 1 1 1 1 2 2 2 2 7 12 10 12 6 11 5 5 2 10 5 4 5 5 5 5 2 2 2 2 2 2 2 2 6 18 19 19 13 13 13 13 13 7 7 7 4 4 1 7 6 7 7 7 7 5 2 7 85 81 13 20 14 13 6 83 20 19 20 19 20 20 20 18 20 19 19 19 20 19 9 20 19 17 20 20 20 20 19 19 20 20 20 20 20 20 20 20 19 18 19 20 19 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/suspend.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <linux/sched/isolation.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, &mi); inflight[READ] = mi.inflight[READ]; inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { if (!owner) return false; if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; q->mq_freeze_disk_dead = !q->disk || test_bit(GD_DEAD, &q->disk->state) || !blk_queue_registered(q); q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } if (owner == q->mq_freeze_owner) q->mq_freeze_owner_depth += 1; return false; } /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { q->mq_freeze_owner = NULL; return true; } return false; } #else static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { return false; } static bool blk_unfreeze_check_owner(struct request_queue *q) { return false; } #endif bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner) { bool freeze; mutex_lock(&q->mq_freeze_lock); freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } return freeze; } void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { bool unfreeze; mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); return unfreeze; } void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) blk_unfreeze_release_lock(q); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); /* * non_owner variant of blk_freeze_queue_start * * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen * by the same task. This is fragile and should not be used if at all * possible. */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { __blk_freeze_queue_start(q, NULL); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); /* non_owner variant of blk_mq_unfreeze_queue */ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. */ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; rcu_read_lock(); list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } rcu_read_unlock(); blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; rcu_read_lock(); list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } static inline void blk_mq_bio_issue_init(struct request_queue *q, struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) bio->issue_time_ns = blk_time_get_ns(); #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; rq->nr_integrity_segments = 0; rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long tag_mask; int i, nr = 0; do { tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); if (unlikely(!tag_mask)) { if (nr == 0) return NULL; break; } tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add_head(data->cached_rqs, rq); nr++; } } while (data->nr_tags > nr); if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list. */ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } } else { blk_mq_tag_busy(data->hctx); } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. */ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, .ctx = NULL, .hctx = NULL }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ ret = -EXDEV; data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; blk_zone_finish_request(rq); if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* * Fully end IO on a request. Does not support partial completions, or * errors. */ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (unlikely(error)) bio->bi_status = error; if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially. */ bio->bi_status = BLK_STS_IOERR; } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline bool blk_rq_passthrough_stats(struct request *req) { struct bio *bio = req->bio; if (!blk_queue_passthrough_stat(req->q)) return false; /* Requests without a bio do not transfer data. */ if (!bio) return false; /* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it. */ if (!bio->bi_bdev) return false; /* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access. */ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) return false; return true; } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (!blk_queue_io_stat(req->q)) return; if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) return; req->rq_flags |= RQF_IO_STAT; req->start_time_ns = blk_time_get_ns(); /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. */ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu) && cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) blk_integrity_prepare(rq); if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); else blk_wait_io(&wait.done); return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_is_flush_data_rq(struct request *rq) { return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce */ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) && blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue. */ smp_mb(); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. */ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); /* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver. */ if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check */ static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) { return hctx->next_cpu >= nr_cpu_ids; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; /* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) { bool need_run; /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); return need_run; } /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. */ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsigned long flags; /* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue. */ spin_lock_irqsave(&hctx->queue->queue_lock, flags); need_run = blk_mq_hw_queue_need_run(hctx); spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); if (!need_run) return; } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); /* * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch * list in the subsequent routine. */ smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. */ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; rq->phys_gap_bit = bio->bi_bvec_gap_bit; rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(rqs))) { bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(rqs); } static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, struct rq_list *queue_rqs) { struct request *rq = rq_list_pop(rqs); struct request_queue *this_q = rq->q; struct request **prev = &rqs->head; struct rq_list matched_rqs = {}; struct request *last = NULL; unsigned depth = 1; rq_list_add_tail(&matched_rqs, rq); while ((rq = *prev)) { if (rq->q == this_q) { /* move rq from rqs to matched_rqs */ *prev = rq->rq_next; rq_list_add_tail(&matched_rqs, rq); depth++; } else { /* leave rq in rqs */ prev = &rq->rq_next; last = rq; } } rqs->tail = last; *queue_rqs = matched_rqs; return depth; } static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) { struct request_queue *q = rq_list_peek(rqs)->q; trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole list in one go. * We already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); if (rq_list_empty(rqs)) return; } blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_list, rq); continue; } list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(rqs)); *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) { do { struct rq_list queue_rqs; unsigned depth; depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); blk_mq_dispatch_queue_requests(&queue_rqs, depth); while (!rq_list_empty(&queue_rqs)) blk_mq_dispatch_list(&queue_rqs, false); } while (!rq_list_empty(rqs)); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { unsigned int depth; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. */ if (plug->rq_count == 0) return; depth = plug->rq_count; plug->rq_count = 0; if (!plug->has_elevator && !from_schedule) { if (plug->multiple_queues) { blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); return; } blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .flags = 0, .shallow_depth = 0, .cmd_flags = bio->bi_opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; struct request *rq; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) rq_qos_cleanup(q, bio); return rq; } /* * Check if there is a suitable cached request and return it. */ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq; } static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { if (rq_list_pop(&plug->cached_rqs) != rq) WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; /* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) || ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) return true; return false; } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; blk_status_t ret; /* * If the plug has a cached request for this queue, try to use it. */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it. */ if (bio_zone_write_plugging(bio)) { nr_segs = bio->__bi_nr_segments; if (rq) blk_queue_exit(q); goto new_request; } /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } /* * Device reconfiguration may change logical block size or reduce the * number of poll queues, so the checks for alignment and poll support * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); goto queue_exit; } bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; if (!bio_integrity_prep(bio)) goto queue_exit; blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; if (bio_needs_zone_write_plugging(bio)) { if (blk_zone_plug_bio(bio, nr_segs)) goto queue_exit; } new_request: if (rq) { blk_mq_use_cached_rq(rq, plug, bio); } else { rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); goto queue_exit; } } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } return; queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one. */ if (!rq) blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) { bio_put(bio); goto free_and_out; } if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. */ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); /* * Free request pages in SRCU callback, which is called from * blk_mq_free_tags(). */ } void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(set, tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; int srcu_idx; srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); return data.has_rq; } static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { enum hctx_type type = hctx->type; int cpu; /* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline */ for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, type, cpu); if (h != hctx) continue; /* this hctx has at least one online CPU */ if (this_cpu != cpu) return true; } return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); int ret = 0; if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) { /* * The wakeup capable IRQ handler of block device is * not called during suspend. Skip the loop by checking * pm_wakeup_pending to prevent the deadlock and improve * suspend latency. */ if (pm_wakeup_pending()) { clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); ret = -EBUSY; break; } msleep(5); } percpu_ref_put(&hctx->queue->q_usage_counter); } return ret; } /* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. For other usage, please call this * helper for checking if one CPU belongs to the specified hctx */ static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, const struct blk_mq_hw_ctx *hctx) { struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); return mapped_hctx == hctx; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); INIT_HLIST_NODE(&hctx->cpuhp_online); } if (!hlist_unhashed(&hctx->cpuhp_dead)) { cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_dead); } } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(&hctx->cpuhp_online)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); if (hlist_unhashed(&hctx->cpuhp_dead)) cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } static void __blk_mq_remove_cpuhp_list(struct list_head *head) { struct blk_mq_hw_ctx *hctx; lockdep_assert_held(&blk_mq_cpuhp_lock); list_for_each_entry(hctx, head, hctx_list) __blk_mq_remove_cpuhp(hctx); } /* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) { LIST_HEAD(hctx_list); spin_lock(&q->unused_hctx_lock); list_splice_init(&q->unused_hctx_list, &hctx_list); spin_unlock(&q->unused_hctx_lock); mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp_list(&hctx_list); mutex_unlock(&blk_mq_cpuhp_lock); spin_lock(&q->unused_hctx_lock); list_splice(&hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } /* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&blk_mq_cpuhp_lock); queue_for_each_hw_ctx(q, hctx, i) __blk_mq_add_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } /* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); } static void blk_free_flush_queue_callback(struct rcu_head *head) { struct blk_flush_queue *fq = container_of(head, struct blk_flush_queue, rcu_head); blk_free_flush_queue(fq); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, blk_free_flush_queue_callback); hctx->fq = NULL; spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto fail; hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; return 0; exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); fail_free_fq: blk_free_flush_queue(hctx->fq); hctx->fq = NULL; fail: return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); INIT_HLIST_NODE(&hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); blk_mq_hctx_kobj_init(hctx); return hctx; free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(set, tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(set, tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. */ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { int cpu; /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs */ for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu)) cpumask_clear_cpu(cpu, hctx->cpumask); } /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q, memflags); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del_rcu(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail_rcu(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { struct queue_limits default_lim = { }; struct request_queue *q; int ret; if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); /* * Only hctx removed from cpuhp list can be reused */ static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) { return hlist_unhashed(&hctx->cpuhp_online) && hlist_unhashed(&hctx->cpuhp_dead); } static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; if (q->nr_hw_queues < set->nr_hw_queues) { struct blk_mq_hw_ctx **new_hctxs; new_hctxs = kcalloc_node(set->nr_hw_queues, sizeof(*new_hctxs), GFP_KERNEL, set->numa_node); if (!new_hctxs) return; if (hctxs) memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); /* * Make sure reading the old queue_hw_ctx from other * context concurrently won't trigger uaf. */ synchronize_rcu_expedited(); kfree(hctxs); hctxs = new_hctxs; } for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctxs[i]); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; end = i; } else { j = i; end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } for (; j < end; j++) { struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } } } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); /* register cpuhp for new initialized hctxs */ blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; /* * ->tag_set has to be setup before initialize hctx, which cpuphp * handler needs it for checking queue mapping */ q->tag_set = set; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return -ENOMEM; if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues) __blk_mq_free_map_and_rqs(set, i); return -ENOMEM; } cond_resched(); } done: set->nr_hw_queues = new_nr_hw_queues; return 0; } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory. */ if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } ret = init_srcu_struct(&set->tags_srcu); if (ret) goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_tags_srcu: cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; srcu_barrier(&set->tags_srcu); cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, struct elevator_tags *et, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; unsigned long i; blk_mq_quiesce_queue(q); if (blk_mq_is_shared_tags(set->flags)) { /* * Shared tags, for sched tags, we allocate max initially hence * tags can't grow, see blk_mq_alloc_sched_tags(). */ if (q->elevator) blk_mq_tag_update_sched_shared_tags(q, nr); else blk_mq_tag_resize_shared_tags(set, nr); } else if (!q->elevator) { /* * Non-shared hardware tags, nr is already checked from * queue_requests_store() and tags can't grow. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; sbitmap_queue_resize(&hctx->tags->bitmap_tags, nr - hctx->tags->nr_reserved_tags); } } else if (nr <= q->elevator->et->nr_requests) { /* Non-shared sched tags, and tags don't grow. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->sched_tags) continue; sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, nr - hctx->sched_tags->nr_reserved_tags); } } else { /* Non-shared sched tags, and tags grow */ queue_for_each_hw_ctx(q, hctx, i) hctx->sched_tags = et->tags[i]; old_et = q->elevator->et; q->elevator->et = et; } q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); blk_mq_unquiesce_queue(q); return old_et; } /* * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, struct xarray *elv_tbl) { struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); if (WARN_ON_ONCE(!ctx)) return; /* The elv_update_nr_hw_queues unfreezes the queue. */ elv_update_nr_hw_queues(q, ctx); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (ctx->type) elevator_put(ctx->type); } /* * Stores elevator name and type in ctx and set current elevator to none. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { struct elv_change_ctx *ctx; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); /* * Accessing q->elevator without holding q->elevator_lock is safe here * because we're called from nr_hw_queue update which is protected by * set->update_nr_hwq_lock in the writer context. So, scheduler update/ * switch code (which acquires the same lock in the reader context) * can't run concurrently. */ if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (WARN_ON_ONCE(!ctx)) return -ENOENT; ctx->name = q->elevator->type->elevator_name; /* * Before we switch elevator to 'none', take a reference to * the elevator module so that while nr_hw_queue update is * running, no one can remove elevator module. We'd put the * reference to elevator module later when we switch back * elevator. */ __elevator_get(q->elevator->type); /* * Store elevator type so that we can release the reference * taken above later. */ ctx->type = q->elevator->type; elevator_set_none(q); } return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; struct xarray elv_tbl; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; memflags = memalloc_noio_save(); xa_init(&elv_tbl); if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) goto out_free_ctx; if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); queues_frozen = true; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto switch_back; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } blk_mq_map_swqueue(q); } switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) { /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); blk_mq_elv_switch_back(q, &elv_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } out_free_ctx: blk_mq_free_sched_ctx_batch(&elv_tbl); xa_destroy(&elv_tbl); memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) return ret; if (task_sigpending(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { if (!blk_mq_can_poll(q)) return 0; return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init);
4 3 3 4 3 4 4 4 3 4 3 3 2 2 3 4 4 4 3 3 4 4 3 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2025 Christoph Hellwig */ #include <linux/blk-integrity.h> #include <linux/blk-mq-dma.h> #include "blk.h" struct phys_vec { phys_addr_t paddr; u32 len; }; static bool __blk_map_iter_next(struct blk_map_iter *iter) { if (iter->iter.bi_size) return true; if (!iter->bio || !iter->bio->bi_next) return false; iter->bio = iter->bio->bi_next; if (iter->is_integrity) { iter->iter = bio_integrity(iter->bio)->bip_iter; iter->bvecs = bio_integrity(iter->bio)->bip_vec; } else { iter->iter = iter->bio->bi_iter; iter->bvecs = iter->bio->bi_io_vec; } return true; } static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; struct bio_vec bv; if (!iter->iter.bi_size) return false; bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next * one could be merged into it. This typically happens when moving to * the next bio, but some callers also don't pack bvecs tight. */ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; if (!__blk_map_iter_next(iter)) break; next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; return true; } /* * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so * we need to ensure our segments are aligned to this as well. * * Note that there is no point in using the slightly more complicated IOVA based * path for single segment mappings. */ static inline bool blk_can_dma_map_iova(struct request *req, struct device *dma_dev) { return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev)); } static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) { iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr); iter->len = vec->len; return true; } static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { unsigned int attrs = 0; if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) attrs |= DMA_ATTR_MMIO; iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len, rq_dma_dir(req), attrs); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; } iter->len = vec->len; return true; } static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter, struct phys_vec *vec) { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; unsigned int attrs = 0; int error; iter->addr = state->addr; iter->len = dma_iova_size(state); if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) attrs |= DMA_ATTR_MMIO; do { error = dma_iova_link(dma_dev, state, vec->paddr, mapped, vec->len, dir, attrs); if (error) break; mapped += vec->len; } while (blk_map_iter_next(req, &iter->iter, vec)); error = dma_iova_sync(dma_dev, state, 0, mapped); if (error) { iter->status = errno_to_blk_status(error); return false; } return true; } static inline void blk_rq_map_iter_init(struct request *rq, struct blk_map_iter *iter) { struct bio *bio = rq->bio; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { *iter = (struct blk_map_iter) { .bvecs = &rq->special_vec, .iter = { .bi_size = rq->special_vec.bv_len, } }; } else if (bio) { *iter = (struct blk_map_iter) { .bio = bio, .bvecs = bio->bi_io_vec, .iter = bio->bi_iter, }; } else { /* the internal flush request may not have bio attached */ *iter = (struct blk_map_iter) {}; } } static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter, unsigned int total_len) { struct phys_vec vec; memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; iter->p2pdma.map = PCI_P2PDMA_MAP_NONE; /* * Grab the first segment ASAP because we'll need it to check for P2P * transfers. */ if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* * P2P transfers through the host bridge are treated the * same as non-P2P transfers below and during unmap. */ case PCI_P2PDMA_MAP_NONE: break; default: iter->status = BLK_STS_INVAL; return false; } if (blk_can_dma_map_iova(req, dma_dev) && dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); memset(state, 0, sizeof(*state)); return blk_dma_map_direct(req, dma_dev, iter, &vec); } /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the * caller and don't need to be initialized. @state needs to be stored for use * at unmap time, @iter is only needed at map time. * * Returns %false if there is no segment to map, including due to an error, or * %true ft it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. * * The caller can call blk_rq_dma_map_coalesce() to check if further segments * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() * to try to map the following segments. */ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { blk_rq_map_iter_init(req, &iter->iter); return blk_dma_map_iter_start(req, dma_dev, state, iter, blk_rq_payload_bytes(req)); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); /** * blk_rq_dma_map_iter_next - map the next DMA segment for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next mapping after a previous call to * blk_rq_dma_map_iter_start(). See there for a detailed description of the * arguments. * * Returns %false if there is no segment to map, including due to an error, or * %true ft it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. */ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { struct phys_vec vec; if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) return blk_dma_map_bus(iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); static inline struct scatterlist * blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { if (!*sg) return sglist; /* * If the driver previously mapped a shorter list, we could see a * termination bit prematurely unless it fully inits the sg table * on each mapping. We KNOW that there must be more entries here * or the driver would be buggy, so force clear the termination bit * to avoid doing a full sg_init_table() in drivers for each command. */ sg_unmark_end(*sg); return sg_next(*sg); } /* * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries. */ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); nsegs++; } if (*last_sg) sg_mark_end(*last_sg); /* * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); #ifdef CONFIG_BLK_DEV_INTEGRITY /** * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment * for a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are * provided by the caller and don't need to be initialized. @state needs to be * stored for use at unmap time, @iter is only needed at map time. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr * and the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. * * The caller can call blk_rq_dma_map_coalesce() to check if further segments * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() * to try to map the following segments. */ bool blk_rq_integrity_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { unsigned len = bio_integrity_bytes(&req->q->limits.integrity, blk_rq_sectors(req)); struct bio *bio = req->bio; iter->iter = (struct blk_map_iter) { .bio = bio, .iter = bio_integrity(bio)->bip_iter, .bvecs = bio_integrity(bio)->bip_vec, .is_integrity = true, }; return blk_dma_map_iter_start(req, dma_dev, state, iter, len); } EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); /** * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for * a request * @req: request to map * @dma_dev: device to map to * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next integrity mapping after a previous call to * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description * of the arguments. * * Returns %false if there is no segment to map, including due to an error, or * %true if it did map a segment. * * If a segment was mapped, the DMA address for it is returned in @iter.addr and * the length in @iter.len. If no segment was mapped the status code is * returned in @iter.status. */ bool blk_rq_integrity_dma_map_iter_next(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter) { struct phys_vec vec; if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) return blk_dma_map_bus(iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @rq: request to map * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg() or * rq->nr_integrity_segments. */ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) { struct request_queue *q = rq->q; struct scatterlist *sg = NULL; struct bio *bio = rq->bio; unsigned int segments = 0; struct phys_vec vec; struct blk_map_iter iter = { .bio = bio, .iter = bio_integrity(bio)->bip_iter, .bvecs = bio_integrity(bio)->bip_vec, .is_integrity = true, }; while (blk_map_iter_next(rq, &iter, &vec)) { sg = blk_next_sg(&sg, sglist); sg_set_page(sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); segments++; } if (sg) sg_mark_end(sg); /* * Something must have been wrong if the figured number of segment * is bigger than number of req's physical integrity segments */ BUG_ON(segments > rq->nr_integrity_segments); BUG_ON(segments > queue_max_integrity_segments(q)); return segments; } EXPORT_SYMBOL(blk_rq_map_integrity_sg); #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* ******************************************************************************/ #include <linux/kernel.h> #include <linux/init.h> #include <linux/configfs.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/dlmconstants.h> #include <net/ipv6.h> #include <net/sock.h> #include "config.h" #include "midcomms.h" #include "lowcomms.h" /* * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>) * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>) * /config/dlm/<cluster>/comms/<comm>/local * /config/dlm/<cluster>/comms/<comm>/addr (write only) * /config/dlm/<cluster>/comms/<comm>/addr_list (read only) * The <cluster> level is useless, but I haven't figured out how to avoid it. */ static struct config_group *space_list; static struct config_group *comm_list; static struct dlm_comm *local_comm; static uint32_t dlm_comm_count; struct dlm_clusters; struct dlm_cluster; struct dlm_spaces; struct dlm_space; struct dlm_comms; struct dlm_comm; struct dlm_nodes; struct dlm_node; static struct config_group *make_cluster(struct config_group *, const char *); static void drop_cluster(struct config_group *, struct config_item *); static void release_cluster(struct config_item *); static struct config_group *make_space(struct config_group *, const char *); static void drop_space(struct config_group *, struct config_item *); static void release_space(struct config_item *); static struct config_item *make_comm(struct config_group *, const char *); static void drop_comm(struct config_group *, struct config_item *); static void release_comm(struct config_item *); static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); static struct configfs_attribute *comm_attrs[]; static struct configfs_attribute *node_attrs[]; const struct rhashtable_params dlm_rhash_rsb_params = { .nelem_hint = 3, /* start small */ .key_len = DLM_RESNAME_MAXLEN, .key_offset = offsetof(struct dlm_rsb, res_name), .head_offset = offsetof(struct dlm_rsb, res_node), .automatic_shrinking = true, }; struct dlm_cluster { struct config_group group; struct dlm_spaces *sps; struct dlm_comms *cms; }; static struct dlm_cluster *config_item_to_cluster(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_cluster, group) : NULL; } enum { CLUSTER_ATTR_TCP_PORT = 0, CLUSTER_ATTR_BUFFER_SIZE, CLUSTER_ATTR_RSBTBL_SIZE, CLUSTER_ATTR_RECOVER_TIMER, CLUSTER_ATTR_TOSS_SECS, CLUSTER_ATTR_SCAN_SECS, CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, }; static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf) { return sprintf(buf, "%s\n", dlm_config.ci_cluster_name); } static ssize_t cluster_cluster_name_store(struct config_item *item, const char *buf, size_t len) { strscpy(dlm_config.ci_cluster_name, buf, sizeof(dlm_config.ci_cluster_name)); return len; } CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_tcp_port_show(struct config_item *item, char *buf) { return sprintf(buf, "%u\n", be16_to_cpu(dlm_config.ci_tcp_port)); } static int dlm_check_zero_and_dlm_running(unsigned int x) { if (!x) return -EINVAL; if (dlm_lowcomms_is_running()) return -EBUSY; return 0; } static ssize_t cluster_tcp_port_store(struct config_item *item, const char *buf, size_t len) { int rc; u16 x; if (!capable(CAP_SYS_ADMIN)) return -EPERM; rc = kstrtou16(buf, 0, &x); if (rc) return rc; rc = dlm_check_zero_and_dlm_running(x); if (rc) return rc; dlm_config.ci_tcp_port = cpu_to_be16(x); return len; } CONFIGFS_ATTR(cluster_, tcp_port); static ssize_t cluster_set(unsigned int *info_field, int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; rc = kstrtouint(buf, 0, &x); if (rc) return rc; if (check_cb) { rc = check_cb(x); if (rc) return rc; } *info_field = x; return len; } #define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ return cluster_set(&dlm_config.ci_##name, check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, "%u\n", dlm_config.ci_##name); \ } \ CONFIGFS_ATTR(cluster_, name); static int dlm_check_protocol_and_dlm_running(unsigned int x) { switch (x) { case 0: /* TCP */ break; case 1: /* SCTP */ if (!IS_ENABLED(CONFIG_IP_SCTP)) return -EOPNOTSUPP; break; default: return -EINVAL; } if (dlm_lowcomms_is_running()) return -EBUSY; return 0; } static int dlm_check_zero(unsigned int x) { if (!x) return -EINVAL; return 0; } static int dlm_check_buffer_size(unsigned int x) { if (x < DLM_MAX_SOCKET_BUFSIZE) return -EINVAL; return 0; } CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); CLUSTER_ATTR(toss_secs, dlm_check_zero); CLUSTER_ATTR(scan_secs, dlm_check_zero); CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size, [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size, [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer, [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, NULL, }; enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, COMM_ATTR_ADDR, COMM_ATTR_ADDR_LIST, COMM_ATTR_MARK, }; enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, NODE_ATTR_RELEASE_RECOVER, }; struct dlm_clusters { struct configfs_subsystem subsys; }; struct dlm_spaces { struct config_group ss_group; }; struct dlm_space { struct config_group group; struct list_head members; struct list_head members_gone; int members_gone_count; struct mutex members_lock; int members_count; struct dlm_nodes *nds; }; struct dlm_comms { struct config_group cs_group; }; struct dlm_comm { struct config_item item; int seq; int nodeid; int local; int addr_count; unsigned int mark; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; struct dlm_nodes { struct config_group ns_group; }; struct dlm_node { struct config_item item; struct list_head list; /* space->members */ int nodeid; int weight; int new; int comm_seq; /* copy of cm->seq when nd->nodeid is set */ unsigned int release_recover; }; struct dlm_member_gone { int nodeid; unsigned int release_recover; struct list_head list; /* space->members_gone */ }; static struct configfs_group_operations clusters_ops = { .make_group = make_cluster, .drop_item = drop_cluster, }; static struct configfs_item_operations cluster_ops = { .release = release_cluster, }; static struct configfs_group_operations spaces_ops = { .make_group = make_space, .drop_item = drop_space, }; static struct configfs_item_operations space_ops = { .release = release_space, }; static struct configfs_group_operations comms_ops = { .make_item = make_comm, .drop_item = drop_comm, }; static struct configfs_item_operations comm_ops = { .release = release_comm, }; static struct configfs_group_operations nodes_ops = { .make_item = make_node, .drop_item = drop_node, }; static struct configfs_item_operations node_ops = { .release = release_node, }; static const struct config_item_type clusters_type = { .ct_group_ops = &clusters_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type cluster_type = { .ct_item_ops = &cluster_ops, .ct_attrs = cluster_attrs, .ct_owner = THIS_MODULE, }; static const struct config_item_type spaces_type = { .ct_group_ops = &spaces_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type space_type = { .ct_item_ops = &space_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type comms_type = { .ct_group_ops = &comms_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type comm_type = { .ct_item_ops = &comm_ops, .ct_attrs = comm_attrs, .ct_owner = THIS_MODULE, }; static const struct config_item_type nodes_type = { .ct_group_ops = &nodes_ops, .ct_owner = THIS_MODULE, }; static const struct config_item_type node_type = { .ct_item_ops = &node_ops, .ct_attrs = node_attrs, .ct_owner = THIS_MODULE, }; static struct dlm_space *config_item_to_space(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_space, group) : NULL; } static struct dlm_comm *config_item_to_comm(struct config_item *i) { return i ? container_of(i, struct dlm_comm, item) : NULL; } static struct dlm_node *config_item_to_node(struct config_item *i) { return i ? container_of(i, struct dlm_node, item) : NULL; } static struct config_group *make_cluster(struct config_group *g, const char *name) { struct dlm_cluster *cl = NULL; struct dlm_spaces *sps = NULL; struct dlm_comms *cms = NULL; cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); if (!cl || !sps || !cms) goto fail; cl->sps = sps; cl->cms = cms; config_group_init_type_name(&cl->group, name, &cluster_type); config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); configfs_add_default_group(&sps->ss_group, &cl->group); configfs_add_default_group(&cms->cs_group, &cl->group); space_list = &sps->ss_group; comm_list = &cms->cs_group; return &cl->group; fail: kfree(cl); kfree(sps); kfree(cms); return ERR_PTR(-ENOMEM); } static void drop_cluster(struct config_group *g, struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); configfs_remove_default_groups(&cl->group); space_list = NULL; comm_list = NULL; config_item_put(i); } static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); kfree(cl->sps); kfree(cl->cms); kfree(cl); } static struct config_group *make_space(struct config_group *g, const char *name) { struct dlm_space *sp = NULL; struct dlm_nodes *nds = NULL; sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); if (!sp || !nds) goto fail; config_group_init_type_name(&sp->group, name, &space_type); config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); INIT_LIST_HEAD(&sp->members_gone); mutex_init(&sp->members_lock); sp->members_count = 0; sp->nds = nds; return &sp->group; fail: kfree(sp); kfree(nds); return ERR_PTR(-ENOMEM); } static void drop_space(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); /* assert list_empty(&sp->members) */ configfs_remove_default_groups(&sp->group); config_item_put(i); } static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); kfree(sp->nds); kfree(sp); } static struct config_item *make_comm(struct config_group *g, const char *name) { struct dlm_comm *cm; unsigned int nodeid; int rv; rv = kstrtouint(name, 0, &nodeid); if (rv) return ERR_PTR(rv); cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); if (!cm) return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); cm->seq = dlm_comm_count++; if (!cm->seq) cm->seq = dlm_comm_count++; cm->nodeid = nodeid; cm->local = 0; cm->addr_count = 0; cm->mark = 0; return &cm->item; } static void drop_comm(struct config_group *g, struct config_item *i) { struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; dlm_midcomms_close(cm->nodeid); while (cm->addr_count--) kfree(cm->addr[cm->addr_count]); config_item_put(i); } static void release_comm(struct config_item *i) { struct dlm_comm *cm = config_item_to_comm(i); kfree(cm); } static struct config_item *make_node(struct config_group *g, const char *name) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); unsigned int nodeid; struct dlm_node *nd; uint32_t seq = 0; int rv; rv = kstrtouint(name, 0, &nodeid); if (rv) return ERR_PTR(rv); nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); if (!nd) return ERR_PTR(-ENOMEM); config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = nodeid; nd->weight = 1; /* default weight of 1 if none is set */ nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ dlm_comm_seq(nodeid, &seq, true); nd->comm_seq = seq; mutex_lock(&sp->members_lock); list_add(&nd->list, &sp->members); sp->members_count++; mutex_unlock(&sp->members_lock); return &nd->item; } static void drop_node(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd = config_item_to_node(i); struct dlm_member_gone *mb_gone; mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL); if (!mb_gone) return; mutex_lock(&sp->members_lock); list_del(&nd->list); sp->members_count--; mb_gone->nodeid = nd->nodeid; mb_gone->release_recover = nd->release_recover; list_add(&mb_gone->list, &sp->members_gone); sp->members_gone_count++; mutex_unlock(&sp->members_lock); config_item_put(i); } static void release_node(struct config_item *i) { struct dlm_node *nd = config_item_to_node(i); kfree(nd); } static struct dlm_clusters clusters_root = { .subsys = { .su_group = { .cg_item = { .ci_namebuf = "dlm", .ci_type = &clusters_type, }, }, }, }; int __init dlm_config_init(void) { config_group_init(&clusters_root.subsys.su_group); mutex_init(&clusters_root.subsys.su_mutex); return configfs_register_subsystem(&clusters_root.subsys); } void dlm_config_exit(void) { configfs_unregister_subsystem(&clusters_root.subsys); } /* * Functions for user space to read/write attributes */ static ssize_t comm_nodeid_show(struct config_item *item, char *buf) { unsigned int nodeid; int rv; rv = kstrtouint(config_item_name(item), 0, &nodeid); if (WARN_ON(rv)) return rv; return sprintf(buf, "%u\n", nodeid); } static ssize_t comm_nodeid_store(struct config_item *item, const char *buf, size_t len) { return len; } static ssize_t comm_local_show(struct config_item *item, char *buf) { return sprintf(buf, "%d\n", config_item_to_comm(item)->local); } static ssize_t comm_local_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *cm = config_item_to_comm(item); int rc = kstrtoint(buf, 0, &cm->local); if (rc) return rc; if (cm->local && !local_comm) local_comm = cm; return len; } static ssize_t comm_addr_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *cm = config_item_to_comm(item); struct sockaddr_storage *addr; int rv; if (len != sizeof(struct sockaddr_storage)) return -EINVAL; if (cm->addr_count >= DLM_MAX_ADDR_COUNT) return -ENOSPC; addr = kzalloc(sizeof(*addr), GFP_NOFS); if (!addr) return -ENOMEM; memcpy(addr, buf, len); rv = dlm_midcomms_addr(cm->nodeid, addr); if (rv) { kfree(addr); return rv; } cm->addr[cm->addr_count++] = addr; return len; } static ssize_t comm_addr_list_show(struct config_item *item, char *buf) { struct dlm_comm *cm = config_item_to_comm(item); ssize_t s; ssize_t allowance; int i; struct sockaddr_storage *addr; struct sockaddr_in *addr_in; struct sockaddr_in6 *addr_in6; /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ allowance = 4096; buf[0] = '\0'; for (i = 0; i < cm->addr_count; i++) { addr = cm->addr[i]; switch(addr->ss_family) { case AF_INET: addr_in = (struct sockaddr_in *)addr; s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); break; case AF_INET6: addr_in6 = (struct sockaddr_in6 *)addr; s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); break; default: s = sprintf(buf0, "%s\n", "<UNKNOWN>"); break; } allowance -= s; if (allowance >= 0) strcat(buf, buf0); else { allowance += s; break; } } return 4096 - allowance; } static ssize_t comm_mark_show(struct config_item *item, char *buf) { return sprintf(buf, "%u\n", config_item_to_comm(item)->mark); } static ssize_t comm_mark_store(struct config_item *item, const char *buf, size_t len) { struct dlm_comm *comm; unsigned int mark; int rc; rc = kstrtouint(buf, 0, &mark); if (rc) return rc; if (mark == 0) mark = dlm_config.ci_mark; comm = config_item_to_comm(item); rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); if (rc) return rc; comm->mark = mark; return len; } CONFIGFS_ATTR(comm_, nodeid); CONFIGFS_ATTR(comm_, local); CONFIGFS_ATTR(comm_, mark); CONFIGFS_ATTR_WO(comm_, addr); CONFIGFS_ATTR_RO(comm_, addr_list); static struct configfs_attribute *comm_attrs[] = { [COMM_ATTR_NODEID] = &comm_attr_nodeid, [COMM_ATTR_LOCAL] = &comm_attr_local, [COMM_ATTR_ADDR] = &comm_attr_addr, [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list, [COMM_ATTR_MARK] = &comm_attr_mark, NULL, }; static ssize_t node_nodeid_show(struct config_item *item, char *buf) { unsigned int nodeid; int rv; rv = kstrtouint(config_item_name(item), 0, &nodeid); if (WARN_ON(rv)) return rv; return sprintf(buf, "%u\n", nodeid); } static ssize_t node_nodeid_store(struct config_item *item, const char *buf, size_t len) { return len; } static ssize_t node_weight_show(struct config_item *item, char *buf) { return sprintf(buf, "%d\n", config_item_to_node(item)->weight); } static ssize_t node_weight_store(struct config_item *item, const char *buf, size_t len) { int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight); if (rc) return rc; return len; } static ssize_t node_release_recover_show(struct config_item *item, char *buf) { struct dlm_node *n = config_item_to_node(item); return sprintf(buf, "%u\n", n->release_recover); } static ssize_t node_release_recover_store(struct config_item *item, const char *buf, size_t len) { struct dlm_node *n = config_item_to_node(item); int rc; rc = kstrtouint(buf, 0, &n->release_recover); if (rc) return rc; return len; } CONFIGFS_ATTR(node_, nodeid); CONFIGFS_ATTR(node_, weight); CONFIGFS_ATTR(node_, release_recover); static struct configfs_attribute *node_attrs[] = { [NODE_ATTR_NODEID] = &node_attr_nodeid, [NODE_ATTR_WEIGHT] = &node_attr_weight, [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover, NULL, }; /* * Functions for the dlm to get the info that's been configured */ static struct dlm_space *get_space(char *name) { struct config_item *i; if (!space_list) return NULL; mutex_lock(&space_list->cg_subsys->su_mutex); i = config_group_find_item(space_list, name); mutex_unlock(&space_list->cg_subsys->su_mutex); return config_item_to_space(i); } static void put_space(struct dlm_space *sp) { config_item_put(&sp->group.cg_item); } static struct dlm_comm *get_comm(int nodeid) { struct config_item *i; struct dlm_comm *cm = NULL; int found = 0; if (!comm_list) return NULL; WARN_ON_ONCE(!mutex_is_locked(&clusters_root.subsys.su_mutex)); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { cm = config_item_to_comm(i); if (cm->nodeid != nodeid) continue; found = 1; config_item_get(i); break; } if (!found) cm = NULL; return cm; } static void put_comm(struct dlm_comm *cm) { config_item_put(&cm->item); } /* caller must free mem */ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out) { struct dlm_member_gone *mb_gone, *mb_safe; struct dlm_config_node *nodes, *node; struct dlm_space *sp; struct dlm_node *nd; int rv, count; sp = get_space(lsname); if (!sp) return -EEXIST; mutex_lock(&sp->members_lock); if (!sp->members_count) { rv = -EINVAL; printk(KERN_ERR "dlm: zero members_count\n"); goto out; } count = sp->members_count + sp->members_gone_count; nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); if (!nodes) { rv = -ENOMEM; goto out; } node = nodes; list_for_each_entry(nd, &sp->members, list) { node->nodeid = nd->nodeid; node->weight = nd->weight; node->new = nd->new; node->comm_seq = nd->comm_seq; node++; nd->new = 0; } /* we delay the remove on nodes until here as configfs does * not support addtional attributes for rmdir(). */ list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) { node->nodeid = mb_gone->nodeid; node->release_recover = mb_gone->release_recover; node->gone = true; node++; list_del(&mb_gone->list); sp->members_gone_count--; kfree(mb_gone); } *count_out = count; *nodes_out = nodes; rv = 0; out: mutex_unlock(&sp->members_lock); put_space(sp); return rv; } int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked) { struct dlm_comm *cm; if (locked) { cm = get_comm(nodeid); } else { mutex_lock(&clusters_root.subsys.su_mutex); cm = get_comm(nodeid); mutex_unlock(&clusters_root.subsys.su_mutex); } if (!cm) return -ENOENT; *seq = cm->seq; put_comm(cm); return 0; } int dlm_our_nodeid(void) { return local_comm->nodeid; } /* num 0 is first addr, num 1 is second addr */ int dlm_our_addr(struct sockaddr_storage *addr, int num) { if (!local_comm) return -1; if (num + 1 > local_comm->addr_count) return -1; memcpy(addr, local_comm->addr[num], sizeof(*addr)); return 0; } /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { .ci_tcp_port = cpu_to_be16(DEFAULT_TCP_PORT), .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME };
132 30 29 31 31 30 30 30 6 30 31 30 31 31 31 29 31 2 2 2 2 2 2 2 2 2 2 31 6 31 31 31 31 31 31 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 // SPDX-License-Identifier: GPL-2.0-only /* * Power Management Quality of Service (PM QoS) support base. * * Copyright (C) 2020 Intel Corporation * * Authors: * Mark Gross <mgross@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * * Provided here is an interface for specifying PM QoS dependencies. It allows * entities depending on QoS constraints to register their requests which are * aggregated as appropriate to produce effective constraints (target values) * that can be monitored by entities needing to respect them, either by polling * or through a built-in notification mechanism. * * In addition to the basic functionality, more specific interfaces for managing * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ #include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/string.h> #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> #include <trace/events/power.h> /* * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ static DEFINE_SPINLOCK(pm_qos_lock); /** * pm_qos_read_value - Return the current effective constraint value. * @c: List of PM QoS constraint requests. */ s32 pm_qos_read_value(struct pm_qos_constraints *c) { return READ_ONCE(c->target_value); } static int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: return plist_first(&c->list)->prio; case PM_QOS_MAX: return plist_last(&c->list)->prio; default: WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { WRITE_ONCE(c->target_value, value); } /** * pm_qos_update_target - Update a list of PM QoS constraint requests. * @c: List of PM QoS requests. * @node: Target list entry. * @action: Action to carry out (add, update or remove). * @value: New request value for the target list entry. * * Update the given list of PM QoS constraint requests, @c, by carrying an * @action involving the @node list entry and @value on it. * * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove * @node from the list, ignore @value). * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { int prev_value, curr_value, new_value; unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; else new_value = value; switch (action) { case PM_QOS_REMOVE_REQ: plist_del(node, &c->list); break; case PM_QOS_UPDATE_REQ: /* * To change the list, atomically remove, reinit with new value * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); break; default: /* no action */ ; } curr_value = pm_qos_get_value(c); pm_qos_set_value(c, curr_value); spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value == curr_value) return 0; if (c->notifiers) blocking_notifier_call_chain(c->notifiers, curr_value, NULL); return 1; } /** * pm_qos_flags_remove_req - Remove device PM QoS flags request. * @pqf: Device PM QoS flags set to remove the request from. * @req: Request to remove from the set. */ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req) { s32 val = 0; list_del(&req->node); list_for_each_entry(req, &pqf->list, node) val |= req->flags; pqf->effective_flags = val; } /** * pm_qos_update_flags - Update a set of PM QoS flags. * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, enum pm_qos_req_action action, s32 val) { unsigned long irqflags; s32 prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, irqflags); prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; switch (action) { case PM_QOS_REMOVE_REQ: pm_qos_flags_remove_req(pqf, req); break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); list_add_tail(&req->node, &pqf->list); pqf->effective_flags |= val; break; default: /* no action */ ; } curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); return prev_value != curr_value; } #ifdef CONFIG_CPU_IDLE /* Definitions related to the CPU latency QoS. */ static struct pm_qos_constraints cpu_latency_constraints = { .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, .type = PM_QOS_MIN, }; static inline bool cpu_latency_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ s32 cpu_latency_qos_limit(void) { return pm_qos_read_value(&cpu_latency_constraints); } /** * cpu_latency_qos_request_active - Check the given PM QoS request. * @req: PM QoS request to check. * * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' * otherwise. */ bool cpu_latency_qos_request_active(struct pm_qos_request *req) { return req->qos == &cpu_latency_constraints; } EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); static void cpu_latency_qos_apply(struct pm_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret = pm_qos_update_target(req->qos, &req->node, action, value); if (ret > 0) wake_up_all_idle_cpus(); } /** * cpu_latency_qos_add_request - Add new CPU latency QoS request. * @req: Pointer to a preallocated handle. * @value: Requested constraint value. * * Use @value to initialize the request handle pointed to by @req, insert it as * a new entry to the CPU latency QoS list and recompute the effective QoS * constraint for that list. * * Callers need to save the handle for later use in updates and removal of the * QoS request represented by it. */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } trace_pm_qos_add_request(value); req->qos = &cpu_latency_constraints; cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. * @req : QoS request to update. * @new_value: New requested constraint value. * * Use @new_value to update the QoS request represented by @req in the CPU * latency QoS list along with updating the effective constraint value for that * list. */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_update_request(new_value); if (new_value == req->node.prio) return; cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. * @req: QoS request to remove. * * Remove the CPU latency QoS request represented by @req from the CPU latency * QoS list along with updating the effective constraint value for that list. */ void cpu_latency_qos_remove_request(struct pm_qos_request *req) { if (!req) return; if (!cpu_latency_qos_request_active(req)) { WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); /* User space interface to the CPU latency QoS via misc device. */ static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); filp->private_data = req; return 0; } static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { struct pm_qos_request *req = filp->private_data; filp->private_data = NULL; cpu_latency_qos_remove_request(req); kfree(req); return 0; } static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { struct pm_qos_request *req = filp->private_data; unsigned long flags; s32 value; if (!req || !cpu_latency_qos_request_active(req)) return -EINVAL; spin_lock_irqsave(&pm_qos_lock, flags); value = pm_qos_get_value(&cpu_latency_constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; } else { int ret; ret = kstrtos32_from_user(buf, count, 16, &value); if (ret) return ret; } cpu_latency_qos_update_request(filp->private_data, value); return count; } static const struct file_operations cpu_latency_qos_fops = { .write = cpu_latency_qos_write, .read = cpu_latency_qos_read, .open = cpu_latency_qos_open, .release = cpu_latency_qos_release, .llseek = noop_llseek, }; static struct miscdevice cpu_latency_qos_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "cpu_dma_latency", .fops = &cpu_latency_qos_fops, }; #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP /* The CPU system wakeup latency QoS. */ static struct pm_qos_constraints cpu_wakeup_latency_constraints = { .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, .type = PM_QOS_MIN, }; /** * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. * * Returns the current CPU system wakeup latency QoS limit that may have been * requested by user space. */ s32 cpu_wakeup_latency_qos_limit(void) { return pm_qos_read_value(&cpu_wakeup_latency_constraints); } static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; req->qos = &cpu_wakeup_latency_constraints; pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); filp->private_data = req; return 0; } static int cpu_wakeup_latency_qos_release(struct inode *inode, struct file *filp) { struct pm_qos_request *req = filp->private_data; filp->private_data = NULL; pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); kfree(req); return 0; } static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { struct pm_qos_request *req = filp->private_data; s32 value; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; } else { int ret; ret = kstrtos32_from_user(buf, count, 16, &value); if (ret) return ret; } if (value < 0) return -EINVAL; pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); return count; } static const struct file_operations cpu_wakeup_latency_qos_fops = { .open = cpu_wakeup_latency_qos_open, .release = cpu_wakeup_latency_qos_release, .read = cpu_wakeup_latency_qos_read, .write = cpu_wakeup_latency_qos_write, .llseek = noop_llseek, }; static struct miscdevice cpu_wakeup_latency_qos_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "cpu_wakeup_latency", .fops = &cpu_wakeup_latency_qos_fops, }; #endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ static int __init cpu_latency_qos_init(void) { int ret; ret = misc_register(&cpu_latency_qos_miscdev); if (ret < 0) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP ret = misc_register(&cpu_wakeup_latency_qos_miscdev); if (ret < 0) pr_err("%s: %s setup failed\n", __func__, cpu_wakeup_latency_qos_miscdev.name); #endif return ret; } late_initcall(cpu_latency_qos_init); #endif /* CONFIG_CPU_IDLE */ /* Definitions related to the frequency QoS below. */ static inline bool freq_qos_value_invalid(s32 value) { return value < 0 && value != PM_QOS_DEFAULT_VALUE; } /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. */ void freq_constraints_init(struct freq_constraints *qos) { struct pm_qos_constraints *c; c = &qos->min_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; c->type = PM_QOS_MAX; c->notifiers = &qos->min_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); c = &qos->max_freq; plist_head_init(&c->list); c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; c->type = PM_QOS_MIN; c->notifiers = &qos->max_freq_notifiers; BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); } /** * freq_qos_read_value - Get frequency QoS constraint for a given list. * @qos: Constraints to evaluate. * @type: QoS request type. */ s32 freq_qos_read_value(struct freq_constraints *qos, enum freq_qos_req_type type) { s32 ret; switch (type) { case FREQ_QOS_MIN: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MIN_DEFAULT_VALUE : pm_qos_read_value(&qos->min_freq); break; case FREQ_QOS_MAX: ret = IS_ERR_OR_NULL(qos) ? FREQ_QOS_MAX_DEFAULT_VALUE : pm_qos_read_value(&qos->max_freq); break; default: WARN_ON(1); ret = 0; } return ret; } /** * freq_qos_apply - Add/modify/remove frequency QoS request. * @req: Constraint request to apply. * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * This is only meant to be called from inside pm_qos, not drivers. */ int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret; switch(req->type) { case FREQ_QOS_MIN: ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, action, value); break; case FREQ_QOS_MAX: ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, action, value); break; default: ret = -EINVAL; } return ret; } /** * freq_qos_add_request - Insert new frequency QoS request into a given list. * @qos: Constraints to update. * @req: Preallocated request object. * @type: Request type. * @value: Request value. * * Insert a new entry into the @qos list of requests, recompute the effective * QoS constraint value for that list and initialize the @req object. The * caller needs to save that object for later use in updates and removal. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_add_request(struct freq_constraints *qos, struct freq_qos_request *req, enum freq_qos_req_type type, s32 value) { int ret; if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), "%s() called for active request\n", __func__)) return -EINVAL; req->qos = qos; req->type = type; ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); if (ret < 0) { req->qos = NULL; req->type = 0; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_request); /** * freq_qos_update_request - Modify existing frequency QoS request. * @req: Request to modify. * @new_value: New request value. * * Update an existing frequency QoS request along with the effective constraint * value for the list of requests it belongs to. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (req->pnode.prio == new_value) return 0; return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(freq_qos_update_request); /** * freq_qos_remove_request - Remove frequency QoS request from its list. * @req: Request to remove. * * Remove the given frequency QoS request from the list of constraints it * belongs to and recompute the effective constraint value for that list. * * Return 1 if the effective constraint value has changed, 0 if the effective * constraint value has not changed, or a negative error code on failures. */ int freq_qos_remove_request(struct freq_qos_request *req) { int ret; if (!req) return -EINVAL; if (WARN(!freq_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); req->qos = NULL; req->type = 0; return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_request); /** * freq_qos_add_notifier - Add frequency QoS change notifier. * @qos: List of requests to add the notifier to. * @type: Request type. * @notifier: Notifier block to add. */ int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_register(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_register(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_add_notifier); /** * freq_qos_remove_notifier - Remove frequency QoS change notifier. * @qos: List of requests to remove the notifier from. * @type: Request type. * @notifier: Notifier block to remove. */ int freq_qos_remove_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, struct notifier_block *notifier) { int ret; if (IS_ERR_OR_NULL(qos) || !notifier) return -EINVAL; switch (type) { case FREQ_QOS_MIN: ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, notifier); break; case FREQ_QOS_MAX: ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, notifier); break; default: WARN_ON(1); ret = -EINVAL; } return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_notifier);
9 9 9 28 27 28 28 28 9 9 9 9 9 9 9 9 9 9 30 29 9 9 9 9 9 9 9 28 28 28 28 28 28 28 28 28 28 28 27 28 28 28 28 28 28 28 28 28 28 28 28 28 30 12 3 28 28 1 28 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. * Chen Liqin <liqin.chen@sunplusct.com> * Lennox Wu <lennox.wu@sunplusct.com> * Copyright (C) 2012 Regents of the University of California */ #include <linux/compat.h> #include <linux/signal.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/resume_user_mode.h> #include <linux/linkage.h> #include <linux/entry-common.h> #include <asm/ucontext.h> #include <asm/vdso.h> #include <asm/signal.h> #include <asm/signal32.h> #include <asm/switch_to.h> #include <asm/vector.h> #include <asm/csr.h> #include <asm/cacheflush.h> unsigned long signal_minsigstksz __ro_after_init; extern u32 __user_rt_sigreturn[2]; static size_t riscv_v_sc_size __ro_after_init; #define DEBUG_SIG 0 struct rt_sigframe { struct siginfo info; struct ucontext uc; #ifndef CONFIG_MMU u32 sigreturn_code[2]; #endif }; #ifdef CONFIG_FPU static long restore_fp_state(struct pt_regs *regs, union __riscv_fp_state __user *sc_fpregs) { long err; struct __riscv_d_ext_state __user *state = &sc_fpregs->d; err = __copy_from_user(&current->thread.fstate, state, sizeof(*state)); if (unlikely(err)) return err; fstate_restore(current, regs); return 0; } static long save_fp_state(struct pt_regs *regs, union __riscv_fp_state __user *sc_fpregs) { long err; struct __riscv_d_ext_state __user *state = &sc_fpregs->d; fstate_save(current, regs); err = __copy_to_user(state, &current->thread.fstate, sizeof(*state)); return err; } #else #define save_fp_state(task, regs) (0) #define restore_fp_state(task, regs) (0) #endif #ifdef CONFIG_RISCV_ISA_V static long save_v_state(struct pt_regs *regs, void __user **sc_vec) { struct __riscv_ctx_hdr __user *hdr; struct __sc_riscv_v_state __user *state; void __user *datap; long err; hdr = *sc_vec; /* Place state to the user's signal context space after the hdr */ state = (struct __sc_riscv_v_state __user *)(hdr + 1); /* Point datap right after the end of __sc_riscv_v_state */ datap = state + 1; /* datap is designed to be 16 byte aligned for better performance */ WARN_ON(!IS_ALIGNED((unsigned long)datap, 16)); get_cpu_vector_context(); riscv_v_vstate_save(&current->thread.vstate, regs); put_cpu_vector_context(); /* Copy everything of vstate but datap. */ err = __copy_to_user(&state->v_state, &current->thread.vstate, offsetof(struct __riscv_v_ext_state, datap)); /* Copy the pointer datap itself. */ err |= __put_user((__force void *)datap, &state->v_state.datap); /* Copy the whole vector content to user space datap. */ err |= __copy_to_user(datap, current->thread.vstate.datap, riscv_v_vsize); /* Copy magic to the user space after saving all vector conetext */ err |= __put_user(RISCV_V_MAGIC, &hdr->magic); err |= __put_user(riscv_v_sc_size, &hdr->size); if (unlikely(err)) return err; /* Only progress the sv_vec if everything has done successfully */ *sc_vec += riscv_v_sc_size; return 0; } /* * Restore Vector extension context from the user's signal frame. This function * assumes a valid extension header. So magic and size checking must be done by * the caller. */ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) { long err; struct __sc_riscv_v_state __user *state = sc_vec; void __user *datap; /* * Mark the vstate as clean prior performing the actual copy, * to avoid getting the vstate incorrectly clobbered by the * discarded vector state. */ riscv_v_vstate_set_restore(current, regs); /* Copy everything of __sc_riscv_v_state except datap. */ err = __copy_from_user(&current->thread.vstate, &state->v_state, offsetof(struct __riscv_v_ext_state, datap)); if (unlikely(err)) return err; /* Copy the pointer datap itself. */ err = __get_user(datap, &state->v_state.datap); if (unlikely(err)) return err; /* * Copy the whole vector content from user space datap. Use * copy_from_user to prevent information leak. */ return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } #else #define save_v_state(task, regs) (0) #define __restore_v_state(task, regs) (0) #endif static long restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { void __user *sc_ext_ptr = &sc->sc_extdesc.hdr; __u32 rsvd; long err; /* sc_regs is structured the same as the start of pt_regs */ err = __copy_from_user(regs, &sc->sc_regs, sizeof(sc->sc_regs)); if (unlikely(err)) return err; /* Restore the floating-point state. */ if (has_fpu()) { err = restore_fp_state(regs, &sc->sc_fpregs); if (unlikely(err)) return err; } /* Check the reserved word before extensions parsing */ err = __get_user(rsvd, &sc->sc_extdesc.reserved); if (unlikely(err)) return err; if (unlikely(rsvd)) return -EINVAL; while (!err) { __u32 magic, size; struct __riscv_ctx_hdr __user *head = sc_ext_ptr; err |= __get_user(magic, &head->magic); err |= __get_user(size, &head->size); if (unlikely(err)) return err; sc_ext_ptr += sizeof(*head); switch (magic) { case END_MAGIC: if (size != END_HDR_SIZE) return -EINVAL; return 0; case RISCV_V_MAGIC: if (!(has_vector() || has_xtheadvector()) || !riscv_v_vstate_query(regs) || size != riscv_v_sc_size) return -EINVAL; err = __restore_v_state(regs, sc_ext_ptr); break; default: return -EINVAL; } sc_ext_ptr = (void __user *)head + size; } return err; } static size_t get_rt_frame_size(bool cal_all) { struct rt_sigframe __user *frame; size_t frame_size; size_t total_context_size = 0; frame_size = sizeof(*frame); if (has_vector() || has_xtheadvector()) { if (cal_all || riscv_v_vstate_query(task_pt_regs(current))) total_context_size += riscv_v_sc_size; } frame_size += total_context_size; frame_size = round_up(frame_size, 16); return frame_size; } SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; struct task_struct *task; sigset_t set; size_t frame_size = get_rt_frame_size(false); /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; if (!access_ok(frame, frame_size)) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; regs->cause = -1UL; return regs->a0; badframe: task = current; if (show_unhandled_signals) { pr_info_ratelimited( "%s[%d]: bad frame in %s: frame=%p pc=%p sp=%p\n", task->comm, task_pid_nr(task), __func__, frame, (void *)regs->epc, (void *)regs->sp); } force_sig(SIGSEGV); return 0; } static long setup_sigcontext(struct rt_sigframe __user *frame, struct pt_regs *regs) { struct sigcontext __user *sc = &frame->uc.uc_mcontext; struct __riscv_ctx_hdr __user *sc_ext_ptr = &sc->sc_extdesc.hdr; long err; /* sc_regs is structured the same as the start of pt_regs */ err = __copy_to_user(&sc->sc_regs, regs, sizeof(sc->sc_regs)); /* Save the floating-point state. */ if (has_fpu()) err |= save_fp_state(regs, &sc->sc_fpregs); /* Save the vector state. */ if ((has_vector() || has_xtheadvector()) && riscv_v_vstate_query(regs)) err |= save_v_state(regs, (void __user **)&sc_ext_ptr); /* Write zero to fp-reserved space and check it on restore_sigcontext */ err |= __put_user(0, &sc->sc_extdesc.reserved); /* And put END __riscv_ctx_hdr at the end. */ err |= __put_user(END_MAGIC, &sc_ext_ptr->magic); err |= __put_user(END_HDR_SIZE, &sc_ext_ptr->size); return err; } static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t framesize) { unsigned long sp; /* Default to using normal stack */ sp = regs->sp; /* * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize))) return (void __user __force *)(-1UL); /* This is the X/Open sanctioned signal stack switching. */ sp = sigsp(sp, ksig) - framesize; /* Align the stack frame. */ sp &= ~0xfUL; return (void __user *)sp; } static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; long err = 0; unsigned long __maybe_unused addr; size_t frame_size = get_rt_frame_size(false); frame = get_sigframe(ksig, regs, frame_size); if (!access_ok(frame, frame_size)) return -EFAULT; err |= copy_siginfo_to_user(&frame->info, &ksig->info); /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(NULL, &frame->uc.uc_link); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigcontext(frame, regs); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) return -EFAULT; /* Set up to return from userspace. */ #ifdef CONFIG_MMU regs->ra = (unsigned long)VDSO_SYMBOL( current->mm->context.vdso, rt_sigreturn); #else /* * For the nommu case we don't have a VDSO. Instead we push two * instructions to call the rt_sigreturn syscall onto the user stack. */ if (copy_to_user(&frame->sigreturn_code, __user_rt_sigreturn, sizeof(frame->sigreturn_code))) return -EFAULT; addr = (unsigned long)&frame->sigreturn_code; /* Make sure the two instructions are pushed to icache. */ flush_icache_range(addr, addr + sizeof(frame->sigreturn_code)); regs->ra = addr; #endif /* CONFIG_MMU */ /* * Set up registers for signal handler. * Registers that we don't modify keep the value they had from * user-space at the time we took the signal. * We always pass siginfo and mcontext, regardless of SA_SIGINFO, * since some things rely on this (e.g. glibc's debug/segfault.c). */ regs->epc = (unsigned long)ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; regs->a0 = ksig->sig; /* a0: signal number */ regs->a1 = (unsigned long)(&frame->info); /* a1: siginfo pointer */ regs->a2 = (unsigned long)(&frame->uc); /* a2: ucontext pointer */ #if DEBUG_SIG pr_info("SIG deliver (%s:%d): sig=%d pc=%p ra=%p sp=%p\n", current->comm, task_pid_nr(current), ksig->sig, (void *)regs->epc, (void *)regs->ra, frame); #endif return 0; } static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int ret; rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_compat_task()) ret = compat_setup_rt_frame(ksig, oldset, regs); else ret = setup_rt_frame(ksig, oldset, regs); signal_setup_done(ret, ksig, 0); } void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; bool syscall = (regs->cause == EXC_SYSCALL); /* If we were from a system call, check for system call restarting */ if (syscall) { continue_addr = regs->epc; restart_addr = continue_addr - 4; retval = regs->a0; /* Avoid additional syscall restarting via ret_from_exception */ regs->cause = -1UL; /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed PC. */ switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTART_RESTARTBLOCK: regs->a0 = regs->orig_a0; regs->epc = restart_addr; break; } } /* * Get the signal to deliver. When running under ptrace, at this point * the debugger may change all of our registers. */ if (get_signal(&ksig)) { /* * Depending on the signal settings, we may need to revert the * decision to restart the system call, but skip this if a * debugger has chosen to restart at a different PC. */ if (regs->epc == restart_addr && (retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { regs->a0 = -EINTR; regs->epc = continue_addr; } /* Actually deliver the signal */ handle_signal(&ksig, regs); return; } /* * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ if (syscall && regs->epc == restart_addr && retval == -ERESTART_RESTARTBLOCK) regs->a7 = __NR_restart_syscall; /* * If there is no signal to deliver, we just put the saved * sigmask back. */ restore_saved_sigmask(); } void init_rt_signal_env(void); void __init init_rt_signal_env(void) { riscv_v_sc_size = sizeof(struct __riscv_ctx_hdr) + sizeof(struct __sc_riscv_v_state) + riscv_v_vsize; /* * Determine the stack space required for guaranteed signal delivery. * The signal_minsigstksz will be populated into the AT_MINSIGSTKSZ entry * in the auxiliary array at process startup. */ signal_minsigstksz = get_rt_frame_size(true); } #ifdef CONFIG_DYNAMIC_SIGFRAME bool sigaltstack_size_valid(size_t ss_size) { return ss_size > get_rt_frame_size(false); } #endif /* CONFIG_DYNAMIC_SIGFRAME */
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 3 2 1 2 3 3 3 3 2 2 1 3 3 3 3 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-or-later /* mpi-pow.c - MPI functions * Copyright (C) 1994, 1996, 1998, 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include <linux/export.h> #include <linux/sched.h> #include <linux/string.h> #include "mpi-internal.h" #include "longlong.h" /**************** * RES = BASE ^ EXP mod MOD */ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) { mpi_ptr_t mp_marker = NULL, bp_marker = NULL, ep_marker = NULL; struct karatsuba_ctx karactx = {}; mpi_ptr_t xp_marker = NULL; mpi_ptr_t tspace = NULL; mpi_ptr_t rp, ep, mp, bp; mpi_size_t esize, msize, bsize, rsize; int msign, bsign, rsign; mpi_size_t size; int mod_shift_cnt; int negative_result; int assign_rp = 0; mpi_size_t tsize = 0; /* to avoid compiler warning */ /* fixme: we should check that the warning is void */ int rc = -ENOMEM; esize = exp->nlimbs; msize = mod->nlimbs; size = 2 * msize; msign = mod->sign; rp = res->d; ep = exp->d; if (!msize) return -EINVAL; if (!esize) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 * depending on if MOD equals 1. */ res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1; if (res->nlimbs) { if (mpi_resize(res, 1) < 0) goto enomem; rp = res->d; rp[0] = 1; } res->sign = 0; goto leave; } /* Normalize MOD (i.e. make its most significant bit set) as required by * mpn_divrem. This will make the intermediate values in the calculation * slightly larger, but the correct result is obtained after a final * reduction using the original MOD value. */ mp = mp_marker = mpi_alloc_limb_space(msize); if (!mp) goto enomem; mod_shift_cnt = count_leading_zeros(mod->d[msize - 1]); if (mod_shift_cnt) mpihelp_lshift(mp, mod->d, msize, mod_shift_cnt); else MPN_COPY(mp, mod->d, msize); bsize = base->nlimbs; bsign = base->sign; if (bsize > msize) { /* The base is larger than the module. Reduce it. */ /* Allocate (BSIZE + 1) with space for remainder and quotient. * (The quotient is (bsize - msize + 1) limbs.) */ bp = bp_marker = mpi_alloc_limb_space(bsize + 1); if (!bp) goto enomem; MPN_COPY(bp, base->d, bsize); /* We don't care about the quotient, store it above the remainder, * at BP + MSIZE. */ mpihelp_divrem(bp + msize, 0, bp, bsize, mp, msize); bsize = msize; /* Canonicalize the base, since we are going to multiply with it * quite a few times. */ MPN_NORMALIZE(bp, bsize); } else bp = base->d; if (!bsize) { res->nlimbs = 0; res->sign = 0; goto leave; } if (res->alloced < size) { /* We have to allocate more space for RES. If any of the input * parameters are identical to RES, defer deallocation of the old * space. */ if (rp == ep || rp == mp || rp == bp) { rp = mpi_alloc_limb_space(size); if (!rp) goto enomem; assign_rp = 1; } else { if (mpi_resize(res, size) < 0) goto enomem; rp = res->d; } } else { /* Make BASE, EXP and MOD not overlap with RES. */ if (rp == bp) { /* RES and BASE are identical. Allocate temp. space for BASE. */ BUG_ON(bp_marker); bp = bp_marker = mpi_alloc_limb_space(bsize); if (!bp) goto enomem; MPN_COPY(bp, rp, bsize); } if (rp == ep) { /* RES and EXP are identical. Allocate temp. space for EXP. */ ep = ep_marker = mpi_alloc_limb_space(esize); if (!ep) goto enomem; MPN_COPY(ep, rp, esize); } if (rp == mp) { /* RES and MOD are identical. Allocate temporary space for MOD. */ BUG_ON(mp_marker); mp = mp_marker = mpi_alloc_limb_space(msize); if (!mp) goto enomem; MPN_COPY(mp, rp, msize); } } MPN_COPY(rp, bp, bsize); rsize = bsize; rsign = bsign; { mpi_size_t i; mpi_ptr_t xp; int c; mpi_limb_t e; mpi_limb_t carry_limb; xp = xp_marker = mpi_alloc_limb_space(2 * (msize + 1)); if (!xp) goto enomem; negative_result = (ep[0] & 1) && base->sign; i = esize - 1; e = ep[i]; c = count_leading_zeros(e); e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ c = BITS_PER_MPI_LIMB - 1 - c; /* Main loop. * * Make the result be pointed to alternately by XP and RP. This * helps us avoid block copying, which would otherwise be necessary * with the overlap restrictions of mpihelp_divmod. With 50% probability * the result after this loop will be in the area originally pointed * by RP (==RES->d), and with 50% probability in the area originally * pointed to by XP. */ for (;;) { while (c) { mpi_size_t xsize; /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */ if (rsize < KARATSUBA_THRESHOLD) mpih_sqr_n_basecase(xp, rp, rsize); else { if (!tspace) { tsize = 2 * rsize; tspace = mpi_alloc_limb_space(tsize); if (!tspace) goto enomem; } else if (tsize < (2 * rsize)) { mpi_free_limb_space(tspace); tsize = 2 * rsize; tspace = mpi_alloc_limb_space(tsize); if (!tspace) goto enomem; } mpih_sqr_n(xp, rp, rsize, tspace); } xsize = 2 * rsize; if (xsize > msize) { mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize); xsize = msize; } swap(rp, xp); rsize = xsize; if ((mpi_limb_signed_t) e < 0) { /*mpihelp_mul( xp, rp, rsize, bp, bsize ); */ if (bsize < KARATSUBA_THRESHOLD) { mpi_limb_t tmp; if (mpihelp_mul (xp, rp, rsize, bp, bsize, &tmp) < 0) goto enomem; } else { if (mpihelp_mul_karatsuba_case (xp, rp, rsize, bp, bsize, &karactx) < 0) goto enomem; } xsize = rsize + bsize; if (xsize > msize) { mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize); xsize = msize; } swap(rp, xp); rsize = xsize; } e <<= 1; c--; cond_resched(); } i--; if (i < 0) break; e = ep[i]; c = BITS_PER_MPI_LIMB; } /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT * steps. Adjust the result by reducing it with the original MOD. * * Also make sure the result is put in RES->d (where it already * might be, see above). */ if (mod_shift_cnt) { carry_limb = mpihelp_lshift(res->d, rp, rsize, mod_shift_cnt); rp = res->d; if (carry_limb) { rp[rsize] = carry_limb; rsize++; } } else { MPN_COPY(res->d, rp, rsize); rp = res->d; } if (rsize >= msize) { mpihelp_divrem(rp + msize, 0, rp, rsize, mp, msize); rsize = msize; } /* Remove any leading zero words from the result. */ if (mod_shift_cnt) mpihelp_rshift(rp, rp, rsize, mod_shift_cnt); MPN_NORMALIZE(rp, rsize); } if (negative_result && rsize) { if (mod_shift_cnt) mpihelp_rshift(mp, mp, msize, mod_shift_cnt); mpihelp_sub(rp, mp, msize, rp, rsize); rsize = msize; rsign = msign; MPN_NORMALIZE(rp, rsize); } res->nlimbs = rsize; res->sign = rsign; leave: rc = 0; enomem: mpihelp_release_karatsuba_ctx(&karactx); if (assign_rp) mpi_assign_limb_space(res, rp, size); if (mp_marker) mpi_free_limb_space(mp_marker); if (bp_marker) mpi_free_limb_space(bp_marker); if (ep_marker) mpi_free_limb_space(ep_marker); if (xp_marker) mpi_free_limb_space(xp_marker); if (tspace) mpi_free_limb_space(tspace); return rc; } EXPORT_SYMBOL_GPL(mpi_powm);
10707 10 10672 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_ACCESS_OK_H__ #define __ASM_GENERIC_ACCESS_OK_H__ /* * Checking whether a pointer is valid for user space access. * These definitions work on most architectures, but overrides can * be used where necessary. */ /* * architectures with compat tasks have a variable TASK_SIZE and should * override this to a constant. */ #ifndef TASK_SIZE_MAX #define TASK_SIZE_MAX TASK_SIZE #endif #ifndef __access_ok /* * 'size' is a compile-time constant for most callers, so optimize for * this case to turn the check into a single comparison against a constant * limit and catch all possible overflows. * On architectures with separate user address space (m68k, s390, parisc, * sparc64) or those without an MMU, this should always return true. * * This version was originally contributed by Jonas Bonn for the * OpenRISC architecture, and was found to be the most efficient * for constant 'size' and 'limit' values. */ static inline int __access_ok(const void __user *ptr, unsigned long size) { unsigned long limit = TASK_SIZE_MAX; unsigned long addr = (unsigned long)ptr; if (IS_ENABLED(CONFIG_ALTERNATE_USER_ADDRESS_SPACE) || !IS_ENABLED(CONFIG_MMU)) return true; return (size <= limit) && (addr <= (limit - size)); } #endif #ifndef access_ok #define access_ok(addr, size) likely(__access_ok(addr, size)) #endif #endif
55 156 371 19 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
2 2 2 2 3149 3134 275 3148 3156 24 25 24 25 169 169 170 170 169 171 165 169 22 22 21 21 22 22 22 22 21 22 12 12 12 7 7 7 7 7 7 2 2 2 2 2 2 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. * * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. * When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup);
2028 2080 2056 2082 2033 2063 2060 2 2067 2 2068 2063 2065 1827 1812 1798 1795 1801 1818 1810 1800 1820 1814 1829 1830 1830 1818 1830 1830 1830 1830 1812 1828 1828 1817 1817 8 1813 16 1816 4 4 4 1787 1803 1793 19 19 18 18 1 1 19 19 18 4 19 19 19 19 573 572 576 573 20 20 256 256 256 256 252 252 256 256 255 255 9 256 254 255 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/fs_struct.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" #include "internal.h" struct prepend_buffer { char *buf; int len; }; #define DECLARE_BUFFER(__name, __buf, __len) \ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} static char *extract_string(struct prepend_buffer *p) { if (likely(p->len >= 0)) return p->buf; return ERR_PTR(-ENAMETOOLONG); } static bool prepend_char(struct prepend_buffer *p, unsigned char c) { if (likely(p->len > 0)) { p->len--; *--p->buf = c; return true; } p->len = -1; return false; } /* * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel * copy might fault as a result. * * The end result will correct itself when we check the * rename sequence count, but we need to be able to handle * the fault gracefully. */ static bool prepend_copy(void *dst, const void *src, int len) { if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len); return false; } return true; } static bool prepend(struct prepend_buffer *p, const char *str, int namelen) { // Already overflowed? if (p->len < 0) return false; // Will overflow? if (p->len < namelen) { // Fill as much as possible from the end of the name str += namelen - p->len; p->buf -= p->len; prepend_copy(p->buf, str, p->len); p->len = -1; return false; } // Fits fully p->len -= namelen; p->buf -= namelen; return prepend_copy(p->buf, str, namelen); } /** * prepend_name - prepend a pathname in front of current buffer pointer * @p: prepend buffer which contains buffer pointer and allocated length * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * But since the length cannot be trusted, we need to copy the name very * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * * Load acquire is needed to make sure that we see the new name data even * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, const struct path *root, struct prepend_buffer *p) { while (dentry != root->dentry || &mnt->mnt != root->mnt) { const struct dentry *parent = READ_ONCE(dentry->d_parent); if (dentry == mnt->mnt.mnt_root) { struct mount *m = READ_ONCE(mnt->mnt_parent); struct mnt_namespace *mnt_ns; if (likely(mnt != m)) { dentry = READ_ONCE(mnt->mnt_mountpoint); mnt = m; continue; } /* Global root */ mnt_ns = READ_ONCE(mnt->mnt_ns); /* open-coded is_mounted() to use local mnt_ns */ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) return 1; // absolute root else return 2; // detached or not attached yet } if (unlikely(dentry == parent)) /* Escaped? */ return 3; prefetch(parent); if (!prepend_name(p, &dentry->d_name)) break; dentry = parent; } return 0; } /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. * It only checks the sequence number of the global rename_lock as any change * in the dentry's d_seq will be preceded by changes in the rename_lock * sequence number. If the sequence number had been changed, it will restart * the whole pathname back-tracing sequence again by taking the rename_lock. * In this case, there is no need to take the RCU read lock as the recursive * parent pointer references will keep the dentry chain alive as long as no * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, struct prepend_buffer *p) { unsigned seq, m_seq = 0; struct prepend_buffer b; int error; rcu_read_lock(); restart_mnt: read_seqbegin_or_lock(&mount_lock, &m_seq); seq = 0; rcu_read_lock(); restart: b = *p; read_seqbegin_or_lock(&rename_lock, &seq); error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (!(m_seq & 1)) rcu_read_unlock(); if (need_seqretry(&mount_lock, m_seq)) { m_seq = 1; goto restart_mnt; } done_seqretry(&mount_lock, m_seq); if (unlikely(error == 3)) b = *p; if (b.len == p->len) prepend_char(&b, '/'); *p = b; return error; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. * * Returns a pointer into the buffer or an error code if the * path was too long. * * "buflen" should be positive. * * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; } while (read_seqretry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); struct path root; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. */ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); get_fs_root_rcu(current->fs, &root); if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); return extract_string(&b); } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); buffer += buflen - sz; return memcpy(buffer, temp, sz); } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); prepend_char(&b, '/'); return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; struct prepend_buffer b; int seq = 0; rcu_read_lock(); restart: dentry = d; b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); if (!prepend_name(&b, &dentry->d_name)) break; dentry = parent; } if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (b.len == p->len) prepend_char(&b, '/'); return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else prepend_char(&b, 0); return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; } while (read_seqretry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. So libc would do something like * * char *getcwd(char * buf, size_t size) * { * int retval; * * retval = sys_getcwd(buf, size); * if (retval >= 0) * return buf; * errno = -retval; * return NULL; * } */ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; char *page = __getname(); if (!page) return -ENOMEM; rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); error = -ENOENT; } else { unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); len = PATH_MAX - b.len; if (unlikely(len > PATH_MAX)) error = -ENAMETOOLONG; else if (unlikely(len > size)) error = -ERANGE; else if (copy_to_user(buf, b.buf, len)) error = -EFAULT; else error = len; } __putname(page); return error; }
176 176 176 21 21 174 3 3 3 6 6 6 6 5 5 1 4 1 1 1 3 3 2 1 1 1 2 4 4 6 58 1 37 13 1 98 97 99 167 199 197 198 198 199 2 1 200 3 200 58 199 197 199 199 197 2 195 23 196 197 165 167 167 167 113 131 130 137 101 130 141 13 165 163 78 77 163 163 161 1 33 18 17 2 2 1 12 186 34 165 34 27 186 148 134 73 186 116 116 107 4 103 74 43 43 42 176 182 142 58 58 58 58 147 224 226 225 223 58 58 21 58 225 225 224 225 147 227 225 173 59 18 129 40 100 225 226 222 223 226 80 100 101 102 1443 1440 1439 93 28 93 1 1 1 1 1 31 2 10 10 10 5 10 9 9 9 8 9 2 8 8 8 8 10 8 8 8 8 8 1 1 1 1 1 1 1 7 8 5 8 2 12 12 12 11 12 12 11 12 12 12 12 12 5 12 12 12 12 204 203 41 3 2 206 2 2 199 23 203 14 196 73 198 1 8 8 1 8 1 1 1 32 32 32 32 32 32 32 32 31 9 9 32 32 32 32 32 31 1 32 32 2 32 32 32 2 32 32 32 32 31 31 32 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 // SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; min_brk = mm->start_brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (!current->brk_randomized) min_brk = mm->end_data; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vmi_align_munmap() will drop the lock on success, so * update it before calling do_vma_munmap(). */ mm->brk = brk; if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, /* unlock = */ true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the * address at which to perform this mapping. See mmap (2) for details. Must be * page-aligned. * @len: The length of the mapping. Will be page-aligned and must be at least 1 * page in size. * @prot: Protection bits describing access required to the mapping. See mmap * (2) for details. * @flags: Flags specifying how the mapping should be performed, see mmap (2) * for details. * @vm_flags: VMA flags which should be set by default, or 0 otherwise. * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. * @populate: A pointer to a value which will be set to 0 if no population of * the range is required, or the number of bytes to populate if it is. Must be * non-NULL. See mmap (2) for details as to under what circumstances population * of the range occurs. * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; mmap_assert_write_locked(mm); if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; /* * addr is returned from get_unmapped_area, * There are two cases: * 1> MAP_FIXED == false * unallocated memory, no need to check sealing. * 1> MAP_FIXED == true * sealing is checked inside mmap_region when * do_vmi_munmap is called. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!can_mmap_file(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. */ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. */ static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) { if (vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. */ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; mmap_read_unlock(mm); if (mmap_write_lock_killable(mm)) return NULL; vma = find_vma_prev(mm, addr, &prev); if (vma && vma->vm_start <= addr) goto success; if (prev && !vma_expand_up(prev, addr)) { vma = prev; goto success; } if (vma && !vma_expand_down(vma, addr)) goto success; mmap_write_unlock(mm); return NULL; success: mmap_write_downgrade(mm); return vma; } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head * * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { VMA_ITERATOR(vmi, mm, start); return do_vmi_munmap(&vmi, mm, start, len, uf, false); } int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); return __vm_munmap(addr, len, true); } /* * Emulation of deprecated remap_file_pages() syscall. */ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) return ret; start = start & PAGE_MASK; size = size & PAGE_MASK; if (start + size <= start) return ret; /* Does pgoff wrap? */ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; if (mmap_read_lock_killable(mm)) return -EINTR; /* * Look up VMA under read lock first so we can perform the security * without holding locks (which can be problematic). We reacquire a * write lock later and check nothing changed underneath us. */ vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) { mmap_read_unlock(mm); return -EINVAL; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; /* Save vm_flags used to calculate prot and flags, and recheck later. */ vm_flags = vma->vm_flags; file = get_file(vma->vm_file); mmap_read_unlock(mm); /* Call outside mmap_lock to be consistent with other callers. */ ret = security_mmap_file(file, prot, flags); if (ret) { fput(file); return ret; } ret = -EINVAL; /* OK security check passed, take write lock + let it rip. */ if (mmap_write_lock_killable(mm)) { fput(file); return -EINTR; } vma = vma_lookup(mm, start); if (!vma) goto out; /* Make sure things didn't change under us. */ if (vma->vm_flags != vm_flags) goto out; if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { VMA_ITERATOR(vmi, mm, vma->vm_end); struct vm_area_struct *next, *prev = vma; for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) goto out; if (next->vm_flags != vma->vm_flags) goto out; if (start + size <= next->vm_end) break; prev = next; } if (!next) goto out; } ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); out: mmap_write_unlock(mm); fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; } int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) return -ENOMEM; if (!len) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; ret = check_brk_limits(addr, len); if (ret) goto limits_failed; ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; vma = vma_prev(&vmi); ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; munmap_failed: limits_failed: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_brk_flags); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); mmap_read_lock(mm); arch_exit_mmap(mm); vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, with preemption * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma_mark_detached(vma); remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. */ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags |= mm->def_flags | VM_DONTEXPAND; if (pgtable_supports_soft_dirty()) vm_flags |= VM_SOFTDIRTY; vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); } #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif static const struct ctl_table mmap_table[] = { { .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", .data = &mmap_rnd_bits, .maxlen = sizeof(mmap_rnd_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_bits_min, .extra2 = (void *)&mmap_rnd_bits_max, }, #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS { .procname = "mmap_rnd_compat_bits", .data = &mmap_rnd_compat_bits, .maxlen = sizeof(mmap_rnd_compat_bits), .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif }; #endif /* CONFIG_SYSCTL */ /* * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { int ret; ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif vma_state_init(); } /* * Initialise sysctl_user_reserve_kbytes. * * This is intended to prevent a user from starting a single memory hogging * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER * mode. * * The default value is min(3% of free memory, 128MB) * 128MB is enough to recover with sshd/login, bash, and top/kill. */ static int init_user_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. * * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin * to log in and kill a memory hogging process. * * Systems with more than 256MB will reserve 8MB, enough to recover * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will * only reserve 3% of free pages by default. */ static int init_admin_reserve(void) { unsigned long free_kbytes; free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. * * The default user reserve max is 128MB, and the default max for the * admin reserve is 8MB. These are usually, but not always, enough to * enable recovery from a memory hogging process using login/sshd, a shell, * and tools like top. It may make sense to increase or even disable the * reserve depending on the existence of swap or variations in the recovery * tools. So, the admin may have changed them. * * If memory is added and the reserves have been eliminated or increased above * the default max, then we'll trust the admin. * * If memory is removed and there isn't enough free memory, then we * need to reset the reserves. * * Otherwise keep the reserve set by the admin. */ static int reserve_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { unsigned long tmp, free_kbytes; switch (action) { case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; case MEM_OFFLINE: free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); pr_info("vm.user_reserve_kbytes reset to %lu\n", sysctl_user_reserve_kbytes); } if (sysctl_admin_reserve_kbytes > free_kbytes) { init_admin_reserve(); pr_info("vm.admin_reserve_kbytes reset to %lu\n", sysctl_admin_reserve_kbytes); } break; default: break; } return NOTIFY_OK; } static int __meminit init_reserve_notifier(void) { if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } subsys_initcall(init_reserve_notifier); /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); if (mmap_write_lock_killable(oldmm)) return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(vmi, mpnt) { struct file *file; retval = vma_start_write_killable(mpnt); if (retval < 0) goto loop_out; if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); if (retval) goto loop_out; vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. * Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* * Link the vma into the MT. After using __mt_dup(), memory * allocation is not necessary here, so it cannot fail. */ vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (retval) { mpnt = vma_next(&vmi); goto loop_out; } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { /* * The entire maple tree has already been duplicated. If the * mmap duplication fails, mark the failure point with * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, * stop releasing VMAs that have not been duplicated after this * point. */ if (mpnt) { mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); if (!retval) dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; }
7 7 7 14 13 14 14 17 18 16 17 19 19 20 19 20 20 19 19 4 3 4 14 14 14 1 1 1 1 7 7 1 1 1 1 1 7 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 // SPDX-License-Identifier: GPL-2.0 #include "blk-rq-qos.h" /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. */ static bool atomic_inc_below(atomic_t *v, unsigned int below) { unsigned int cur = atomic_read(v); do { if (cur >= below) return false; } while (!atomic_try_cmpxchg(v, &cur, cur + 1)); return true; } bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) { return atomic_inc_below(&rq_wait->inflight, limit); } void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->cleanup) rqos->ops->cleanup(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_done(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->done) rqos->ops->done(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->issue) rqos->ops->issue(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) { do { if (rqos->ops->requeue) rqos->ops->requeue(rqos, rq); rqos = rqos->next; } while (rqos); } void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->throttle) rqos->ops->throttle(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { do { if (rqos->ops->track) rqos->ops->track(rqos, rq, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { do { if (rqos->ops->merge) rqos->ops->merge(rqos, rq, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { do { if (rqos->ops->done_bio) rqos->ops->done_bio(rqos, bio); rqos = rqos->next; } while (rqos); } void __rq_qos_queue_depth_changed(struct rq_qos *rqos) { do { if (rqos->ops->queue_depth_changed) rqos->ops->queue_depth_changed(rqos); rqos = rqos->next; } while (rqos); } /* * Return true, if we can't increase the depth further by scaling */ bool rq_depth_calc_max_depth(struct rq_depth *rqd) { unsigned int depth; bool ret = false; /* * For QD=1 devices, this is a special case. It's important for those * to have one request ready when one completes, so force a depth of * 2 for those devices. On the backend, it'll be a depth of 1 anyway, * since the device can't have more than that in flight. If we're * scaling down, then keep a setting of 1/1/1. */ if (rqd->queue_depth == 1) { if (rqd->scale_step > 0) rqd->max_depth = 1; else { rqd->max_depth = 2; ret = true; } } else { /* * scale_step == 0 is our default state. If we have suffered * latency spikes, step will be > 0, and we shrink the * allowed write depths. If step is < 0, we're only doing * writes, and we allow a temporarily higher depth to * increase performance. */ depth = min_t(unsigned int, rqd->default_depth, rqd->queue_depth); if (rqd->scale_step > 0) depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); else if (rqd->scale_step < 0) { unsigned int maxd = 3 * rqd->queue_depth / 4; depth = 1 + ((depth - 1) << -rqd->scale_step); if (depth > maxd) { depth = maxd; ret = true; } } rqd->max_depth = depth; } return ret; } /* Returns true on success and false if scaling up wasn't possible */ bool rq_depth_scale_up(struct rq_depth *rqd) { /* * Hit max in previous round, stop here */ if (rqd->scaled_max) return false; rqd->scale_step--; rqd->scaled_max = rq_depth_calc_max_depth(rqd); return true; } /* * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we * had a latency violation. Returns true on success and returns false if * scaling down wasn't possible. */ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents * ->scale_step from going to crazy values, if the device can't * keep up. */ if (rqd->max_depth == 1) return false; if (rqd->scale_step < 0 && hard_throttle) rqd->scale_step = 0; else rqd->scale_step++; rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); return true; } struct rq_qos_wait_data { struct wait_queue_entry wq; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; bool got_token; }; static int rq_qos_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct rq_qos_wait_data *data = container_of(curr, struct rq_qos_wait_data, wq); /* * If we fail to get a budget, return -1 to interrupt the wake up loop * in __wake_up_common. */ if (!data->cb(data->rqw, data->private_data)) return -1; data->got_token = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). */ default_wake_function(curr, mode, wake_flags, key); /* * Note that the order of operations is important as finish_wait() * tests whether @curr is removed without grabbing the lock. This * should be the last thing to do to make sure we will not have a * UAF access to @data. And the semantics of memory barrier in it * also make sure the waiter will see the latest @data->got_token * once list_empty_careful() in finish_wait() returns true. */ list_del_init_careful(&curr->entry); return 1; } /** * rq_qos_wait - throttle on a rqw if we need to * @rqw: rqw to throttle on * @private_data: caller provided specific data * @acquire_inflight_cb: inc the rqw->inflight counter if we can * @cleanup_cb: the callback to cleanup in case we race with a waker * * This provides a uniform place for the rq_qos users to do their throttling. * Since you can end up with a lot of things sleeping at once, this manages the * waking up based on the resources available. The acquire_inflight_cb should * inc the rqw->inflight if we have the ability to do so, or return false if not * and then we will sleep until the room becomes available. * * cleanup_cb is in case that we race with a waker and need to cleanup the * inflight count accordingly. */ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { .rqw = rqw, .cb = acquire_inflight_cb, .private_data = private_data, .got_token = false, }; bool first_waiter; /* * If there are no waiters in the waiting queue, try to increase the * inflight counter if we can. Otherwise, prepare for adding ourselves * to the waiting queue. */ if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; init_wait_func(&data.wq, rq_qos_wake_function); first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); /* * Make sure there is at least one inflight process; otherwise, waiters * will never be woken up. Since there may be no inflight process before * adding ourselves to the waiting queue above, we need to try to * increase the inflight counter for ourselves. And it is sufficient to * guarantee that at least the first waiter to enter the waiting queue * will re-check the waiting condition before going to sleep, thus * ensuring forward progress. */ if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { finish_wait(&rqw->wait, &data.wq); /* * We raced with rq_qos_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. * * Enough memory barrier in list_empty_careful() in * finish_wait() is paired with list_del_init_careful() * in rq_qos_wake_function() to make sure we will see * the latest @data->got_token. */ if (data.got_token) cleanup_cb(rqw, private_data); return; } /* we are now relying on the waker to increase our inflight counter. */ do { if (data.got_token) break; io_schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); } void rq_qos_exit(struct request_queue *q) { mutex_lock(&q->rq_qos_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); } blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); mutex_unlock(&q->rq_qos_mutex); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops) { struct request_queue *q = disk->queue; unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); rqos->disk = disk; rqos->id = id; rqos->ops = ops; /* * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. */ memflags = blk_mq_freeze_queue(q); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); if (rqos->ops->debugfs_attrs) { mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_rqos(rqos); mutex_unlock(&q->debugfs_mutex); } return 0; ebusy: blk_mq_unfreeze_queue(q, memflags); return -EBUSY; } void rq_qos_del(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); memflags = blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } if (!q->rq_qos) blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); mutex_unlock(&q->debugfs_mutex); }
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty * path correctly just ensure @path is initialized: * * struct path path __free(path_put) = {}; */ #define __free_path_put path_put #endif /* _LINUX_PATH_H */
14 14 14 14 14 14 46 47 47 1 1 1 1 47 47 15 47 14 14 14 14 14 14 14 47 47 47 47 47 47 47 47 47 47 1 1 1 1 1 10 9 10 10 15 15 15 15 15 15 15 15 14 14 14 14 14 1 1 1 1 1 1 15 1 15 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include "seq_timer.h" #include "seq_queue.h" #include "seq_info.h" /* allowed sequencer timer frequencies, in Hz */ #define MIN_FREQUENCY 10 #define MAX_FREQUENCY 6250 #define DEFAULT_FREQUENCY 1000 #define SKEW_BASE 0x10000 /* 16bit shift */ static void snd_seq_timer_set_tick_resolution(struct snd_seq_timer *tmr) { unsigned int threshold = tmr->tempo_base == 1000 ? 1000000 : 10000; if (tmr->tempo < threshold) tmr->tick.resolution = (tmr->tempo * tmr->tempo_base) / tmr->ppq; else { /* might overflow.. */ unsigned int s; s = tmr->tempo % tmr->ppq; s = (s * tmr->tempo_base) / tmr->ppq; tmr->tick.resolution = (tmr->tempo / tmr->ppq) * tmr->tempo_base; tmr->tick.resolution += s; } if (tmr->tick.resolution <= 0) tmr->tick.resolution = 1; snd_seq_timer_update_tick(&tmr->tick, 0); } /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void) { struct snd_seq_timer *tmr; tmr = kzalloc(sizeof(*tmr), GFP_KERNEL); if (!tmr) return NULL; spin_lock_init(&tmr->lock); /* reset setup to defaults */ snd_seq_timer_defaults(tmr); /* reset time */ snd_seq_timer_reset(tmr); return tmr; } /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr) { struct snd_seq_timer *t = *tmr; *tmr = NULL; if (t == NULL) { pr_debug("ALSA: seq: snd_seq_timer_delete() called with NULL timer\n"); return; } t->running = 0; /* reset time */ snd_seq_timer_stop(t); snd_seq_timer_reset(t); kfree(t); } void snd_seq_timer_defaults(struct snd_seq_timer * tmr) { guard(spinlock_irqsave)(&tmr->lock); /* setup defaults */ tmr->ppq = 96; /* 96 PPQ */ tmr->tempo = 500000; /* 120 BPM */ tmr->tempo_base = 1000; /* 1us */ snd_seq_timer_set_tick_resolution(tmr); tmr->running = 0; tmr->type = SNDRV_SEQ_TIMER_ALSA; tmr->alsa_id.dev_class = seq_default_timer_class; tmr->alsa_id.dev_sclass = seq_default_timer_sclass; tmr->alsa_id.card = seq_default_timer_card; tmr->alsa_id.device = seq_default_timer_device; tmr->alsa_id.subdevice = seq_default_timer_subdevice; tmr->preferred_resolution = seq_default_timer_resolution; tmr->skew = tmr->skew_base = SKEW_BASE; } static void seq_timer_reset(struct snd_seq_timer *tmr) { /* reset time & songposition */ tmr->cur_time.tv_sec = 0; tmr->cur_time.tv_nsec = 0; tmr->tick.cur_tick = 0; tmr->tick.fraction = 0; } void snd_seq_timer_reset(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); seq_timer_reset(tmr); } /* called by timer interrupt routine. the period time since previous invocation is passed */ static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) { struct snd_seq_queue *q = timeri->callback_data; struct snd_seq_timer *tmr; if (q == NULL) return; tmr = q->timer; if (tmr == NULL) return; scoped_guard(spinlock_irqsave, &tmr->lock) { if (!tmr->running) return; resolution *= ticks; if (tmr->skew != tmr->skew_base) { /* FIXME: assuming skew_base = 0x10000 */ resolution = (resolution >> 16) * tmr->skew + (((resolution & 0xffff) * tmr->skew) >> 16); } /* update timer */ snd_seq_inc_time_nsec(&tmr->cur_time, resolution); /* calculate current tick */ snd_seq_timer_update_tick(&tmr->tick, resolution); /* register actual time of this timer update */ ktime_get_ts64(&tmr->last_update); } /* check queues and dispatch events */ snd_seq_check_queue(q, 1, 0); } /* set current tempo */ int snd_seq_timer_set_tempo(struct snd_seq_timer * tmr, int tempo) { if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if ((unsigned int)tempo != tmr->tempo) { tmr->tempo = tempo; snd_seq_timer_set_tick_resolution(tmr); } return 0; } /* set current tempo, ppq and base in a shot */ int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq, unsigned int tempo_base) { int changed; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0 || ppq <= 0) return -EINVAL; /* allow only 10ns or 1us tempo base for now */ if (tempo_base && tempo_base != 10 && tempo_base != 1000) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if (tmr->running && (ppq != tmr->ppq)) { /* refuse to change ppq on running timers */ /* because it will upset the song position (ticks) */ pr_debug("ALSA: seq: cannot change ppq of a running timer\n"); return -EBUSY; } changed = (tempo != tmr->tempo) || (ppq != tmr->ppq); tmr->tempo = tempo; tmr->ppq = ppq; tmr->tempo_base = tempo_base ? tempo_base : 1000; if (changed) snd_seq_timer_set_tick_resolution(tmr); return 0; } /* set current tick position */ int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); tmr->tick.cur_tick = position; tmr->tick.fraction = 0; return 0; } /* set current real-time position */ int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; snd_seq_sanity_real_time(&position); guard(spinlock_irqsave)(&tmr->lock); tmr->cur_time = position; return 0; } /* set timer skew */ int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base) { if (snd_BUG_ON(!tmr)) return -EINVAL; /* FIXME */ if (base != SKEW_BASE) { pr_debug("ALSA: seq: invalid skew base 0x%x\n", base); return -EINVAL; } guard(spinlock_irqsave)(&tmr->lock); tmr->skew = skew; return 0; } int snd_seq_timer_open(struct snd_seq_queue *q) { struct snd_timer_instance *t; struct snd_seq_timer *tmr; char str[32]; int err; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tmr->timeri) return -EBUSY; sprintf(str, "sequencer queue %i", q->queue); if (tmr->type != SNDRV_SEQ_TIMER_ALSA) /* standard ALSA timer */ return -EINVAL; if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) tmr->alsa_id.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; t = snd_timer_instance_new(str); if (!t) return -ENOMEM; t->callback = snd_seq_timer_interrupt; t->callback_data = q; t->flags |= SNDRV_TIMER_IFLG_AUTO; err = snd_timer_open(t, &tmr->alsa_id, q->queue); if (err < 0 && tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) { if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_GLOBAL || tmr->alsa_id.device != SNDRV_TIMER_GLOBAL_SYSTEM) { struct snd_timer_id tid; memset(&tid, 0, sizeof(tid)); tid.dev_class = SNDRV_TIMER_CLASS_GLOBAL; tid.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; tid.card = -1; tid.device = SNDRV_TIMER_GLOBAL_SYSTEM; err = snd_timer_open(t, &tid, q->queue); } } if (err < 0) { pr_err("ALSA: seq fatal error: cannot create timer (%i)\n", err); snd_timer_instance_free(t); return err; } scoped_guard(spinlock_irq, &tmr->lock) { if (tmr->timeri) err = -EBUSY; else tmr->timeri = t; } if (err < 0) { snd_timer_close(t); snd_timer_instance_free(t); return err; } return 0; } int snd_seq_timer_close(struct snd_seq_queue *q) { struct snd_seq_timer *tmr; struct snd_timer_instance *t; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; scoped_guard(spinlock_irq, &tmr->lock) { t = tmr->timeri; tmr->timeri = NULL; } if (t) { snd_timer_close(t); snd_timer_instance_free(t); } return 0; } static int seq_timer_stop(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (!tmr->running) return 0; tmr->running = 0; snd_timer_pause(tmr->timeri); return 0; } int snd_seq_timer_stop(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_stop(tmr); } static int initialize_timer(struct snd_seq_timer *tmr) { struct snd_timer *t; unsigned long freq; t = tmr->timeri->timer; if (!t) return -EINVAL; freq = tmr->preferred_resolution; if (!freq) freq = DEFAULT_FREQUENCY; else if (freq < MIN_FREQUENCY) freq = MIN_FREQUENCY; else if (freq > MAX_FREQUENCY) freq = MAX_FREQUENCY; tmr->ticks = 1; if (!(t->hw.flags & SNDRV_TIMER_HW_SLAVE)) { unsigned long r = snd_timer_resolution(tmr->timeri); if (r) { tmr->ticks = (unsigned int)(1000000000uL / (r * freq)); if (! tmr->ticks) tmr->ticks = 1; } } tmr->initialized = 1; return 0; } static int seq_timer_start(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) seq_timer_stop(tmr); seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_start(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_start(tmr); } static int seq_timer_continue(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) return -EBUSY; if (! tmr->initialized) { seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; } snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_continue(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_continue(tmr); } /* return current 'real' time. use timeofday() to get better granularity. */ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime) { snd_seq_real_time_t cur_time; guard(spinlock_irqsave)(&tmr->lock); cur_time = tmr->cur_time; if (adjust_ktime && tmr->running) { struct timespec64 tm; ktime_get_ts64(&tm); tm = timespec64_sub(tm, tmr->last_update); cur_time.tv_nsec += tm.tv_nsec; cur_time.tv_sec += tm.tv_sec; snd_seq_sanity_real_time(&cur_time); } return cur_time; } /* TODO: use interpolation on tick queue (will only be useful for very high PPQ values) */ snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return tmr->tick.cur_tick; } #ifdef CONFIG_SND_PROC_FS /* exported to seq_info.c */ void snd_seq_info_timer_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx; struct snd_seq_timer *tmr; struct snd_timer_instance *ti; unsigned long resolution; for (idx = 0; idx < SNDRV_SEQ_MAX_QUEUES; idx++) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(idx); if (q == NULL) continue; scoped_guard(mutex, &q->timer_mutex) { tmr = q->timer; if (!tmr) break; ti = tmr->timeri; if (!ti) break; snd_iprintf(buffer, "Timer for queue %i : %s\n", q->queue, ti->timer->name); resolution = snd_timer_resolution(ti) * tmr->ticks; snd_iprintf(buffer, " Period time : %lu.%09lu\n", resolution / 1000000000, resolution % 1000000000); snd_iprintf(buffer, " Skew : %u / %u\n", tmr->skew, tmr->skew_base); } } } #endif /* CONFIG_SND_PROC_FS */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 // SPDX-License-Identifier: GPL-2.0-only /* * net/psample/psample.c - Netlink channel for packet sampling * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/psample.h> #include <linux/spinlock.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> #define PSAMPLE_MAX_PACKET_SIZE 0xffff static LIST_HEAD(psample_groups_list); static DEFINE_SPINLOCK(psample_groups_lock); /* multicast groups */ enum psample_nl_multicast_groups { PSAMPLE_NL_MCGRP_CONFIG, PSAMPLE_NL_MCGRP_SAMPLE, }; static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static struct genl_family psample_nl_family __ro_after_init; static int psample_group_nl_fill(struct sk_buff *msg, struct psample_group *group, enum psample_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int ret; hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); if (ret < 0) goto error; ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); if (ret < 0) goto error; genlmsg_end(msg, hdr); return 0; error: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct psample_group *group; int start = cb->args[0]; int idx = 0; int err; spin_lock_bh(&psample_groups_lock); list_for_each_entry(group, &psample_groups_list, list) { if (!net_eq(group->net, sock_net(msg->sk))) continue; if (idx < start) { idx++; continue; } err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) break; idx++; } spin_unlock_bh(&psample_groups_lock); cb->args[0] = idx; return msg->len; } static const struct genl_small_ops psample_nl_ops[] = { { .cmd = PSAMPLE_CMD_GET_GROUP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = psample_nl_cmd_get_group_dumpit, /* can be retrieved by unprivileged users */ } }; static struct genl_family psample_nl_family __ro_after_init = { .name = PSAMPLE_GENL_NAME, .version = PSAMPLE_GENL_VERSION, .maxattr = PSAMPLE_ATTR_MAX, .netnsok = true, .module = THIS_MODULE, .mcgrps = psample_nl_mcgrps, .small_ops = psample_nl_ops, .n_small_ops = ARRAY_SIZE(psample_nl_ops), .resv_start_op = PSAMPLE_CMD_GET_GROUP + 1, .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), }; static void psample_group_notify(struct psample_group *group, enum psample_command cmd) { struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return; err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); if (!err) genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); else nlmsg_free(msg); } static struct psample_group *psample_group_create(struct net *net, u32 group_num) { struct psample_group *group; group = kzalloc(sizeof(*group), GFP_ATOMIC); if (!group) return NULL; group->net = net; group->group_num = group_num; list_add_tail(&group->list, &psample_groups_list); psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); return group; } static void psample_group_destroy(struct psample_group *group) { psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); list_del(&group->list); kfree_rcu(group, rcu); } static struct psample_group * psample_group_lookup(struct net *net, u32 group_num) { struct psample_group *group; list_for_each_entry(group, &psample_groups_list, list) if ((group->group_num == group_num) && (group->net == net)) return group; return NULL; } struct psample_group *psample_group_get(struct net *net, u32 group_num) { struct psample_group *group; spin_lock_bh(&psample_groups_lock); group = psample_group_lookup(net, group_num); if (!group) { group = psample_group_create(net, group_num); if (!group) goto out; } group->refcount++; out: spin_unlock_bh(&psample_groups_lock); return group; } EXPORT_SYMBOL_GPL(psample_group_get); void psample_group_take(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); group->refcount++; spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_take); void psample_group_put(struct psample_group *group) { spin_lock_bh(&psample_groups_lock); if (--group->refcount == 0) psample_group_destroy(group); spin_unlock_bh(&psample_groups_lock); } EXPORT_SYMBOL_GPL(psample_group_put); #ifdef CONFIG_INET static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const void *tun_opts = ip_tunnel_info_opts(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) && nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, PSAMPLE_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) return -EMSGSIZE; switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->u.ipv4.src)) return -EMSGSIZE; if (tun_key->u.ipv4.dst && nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->u.ipv4.dst)) return -EMSGSIZE; break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, &tun_key->u.ipv6.src)) return -EMSGSIZE; if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, &tun_key->u.ipv6.dst)) return -EMSGSIZE; break; } if (tun_key->tos && nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) return -EMSGSIZE; if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (tun_key->tp_src && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) return -EMSGSIZE; if (tun_key->tp_dst && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) return -EMSGSIZE; if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; } return 0; } static int psample_ip_tun_to_nlattr(struct sk_buff *skb, struct ip_tunnel_info *tun_info) { struct nlattr *nla; int err; nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); if (!nla) return -EMSGSIZE; err = __psample_ip_tun_to_nlattr(skb, tun_info); if (err) { nla_nest_cancel(skb, nla); return err; } nla_nest_end(skb, nla); return 0; } static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) { unsigned short tun_proto = ip_tunnel_info_af(tun_info); const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) sum += nla_total_size(0); switch (tun_proto) { case AF_INET: if (tun_key->u.ipv4.src) sum += nla_total_size(sizeof(u32)); if (tun_key->u.ipv4.dst) sum += nla_total_size(sizeof(u32)); break; case AF_INET6: if (!ipv6_addr_any(&tun_key->u.ipv6.src)) sum += nla_total_size(sizeof(struct in6_addr)); if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) sum += nla_total_size(sizeof(struct in6_addr)); break; } if (tun_key->tos) sum += nla_total_size(sizeof(u8)); sum += nla_total_size(sizeof(u8)); /* TTL */ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_key->tp_src) sum += nla_total_size(sizeof(u16)); if (tun_key->tp_dst) sum += nla_total_size(sizeof(u16)); if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_opts_len) { if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); } return sum; } #endif void psample_sample_packet(struct psample_group *group, const struct sk_buff *skb, u32 sample_rate, const struct psample_metadata *md) { ktime_t tstamp = ktime_get_real(); int out_ifindex = md->out_ifindex; int in_ifindex = md->in_ifindex; u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif struct sk_buff *nl_skb; int data_len; int meta_len; void *data; int ret; if (!genl_has_listeners(&psample_nl_family, group->net, PSAMPLE_NL_MCGRP_SAMPLE)) return; meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)) + /* seq */ nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ (md->rate_as_probability ? nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); if (tun_info) meta_len += psample_tunnel_meta_len(tun_info); #endif data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN - NLA_ALIGNTO; nl_skb = genlmsg_new(meta_len + nla_total_size(data_len), GFP_ATOMIC); if (unlikely(!nl_skb)) return; data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, PSAMPLE_CMD_SAMPLE); if (unlikely(!data)) goto error; if (in_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); if (unlikely(ret < 0)) goto error; } if (out_ifindex) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); if (unlikely(ret < 0)) goto error; } ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); if (unlikely(ret < 0)) goto error; ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); if (unlikely(ret < 0)) goto error; if (md->out_tc_valid) { ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); if (unlikely(ret < 0)) goto error; } if (md->out_tc_occ_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, md->out_tc_occ, PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } if (md->latency_valid) { ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, md->latency, PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; } ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); if (unlikely(ret < 0)) goto error; ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, be16_to_cpu(skb->protocol)); if (unlikely(ret < 0)) goto error; if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; nla = skb_put(nl_skb, nla_len); nla->nla_type = PSAMPLE_ATTR_DATA; nla->nla_len = nla_attr_size(data_len); if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) goto error; } #ifdef CONFIG_INET if (tun_info) { ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); if (unlikely(ret < 0)) goto error; } #endif if (md->user_cookie && md->user_cookie_len && nla_put(nl_skb, PSAMPLE_ATTR_USER_COOKIE, md->user_cookie_len, md->user_cookie)) goto error; if (md->rate_as_probability && nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); return; error: pr_err_ratelimited("Could not create psample log message\n"); nlmsg_free(nl_skb); } EXPORT_SYMBOL_GPL(psample_sample_packet); static int __init psample_module_init(void) { return genl_register_family(&psample_nl_family); } static void __exit psample_module_exit(void) { genl_unregister_family(&psample_nl_family); } module_init(psample_module_init); module_exit(psample_module_exit); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("netlink channel for packet sampling"); MODULE_LICENSE("GPL v2");
187 190 187 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2014 Darius Rad <darius@bluespec.com> * Copyright (C) 2017 SiFive */ #include <linux/syscalls.h> #include <asm/cacheflush.h> static long riscv_sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset, unsigned long page_shift_offset) { if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) return -EINVAL; return ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> (PAGE_SHIFT - page_shift_offset)); } #ifdef CONFIG_64BIT SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, offset) { return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 0); } #endif #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, offset) { /* * Note that the shift for mmap2 is constant (12), * regardless of PAGE_SIZE */ return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 12); } #endif /* * Allows the instruction cache to be flushed from userspace. Despite RISC-V * having a direct 'fence.i' instruction available to userspace (which we * can't trap!), that's not actually viable when running on Linux because the * kernel might schedule a process on another hart. There is no way for * userspace to handle this without invoking the kernel (as it doesn't know the * thread->hart mappings), so we've defined a RISC-V specific system call to * flush the instruction cache. * * sys_riscv_flush_icache() is defined to flush the instruction cache over an * address range, with the flush applying to either all threads or just the * caller. We don't currently do anything with the address range, that's just * in there for forwards compatibility. */ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, uintptr_t, flags) { /* Check the reserved flags. */ if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL)) return -EINVAL; flush_icache_mm(current->mm, flags & SYS_RISCV_FLUSH_ICACHE_LOCAL); return 0; } /* Not defined using SYSCALL_DEFINE0 to avoid error injection */ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *__unused) { return -ENOSYS; }
87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_GETORDER_H #define __ASM_GENERIC_GETORDER_H #ifndef __ASSEMBLY__ #include <linux/compiler.h> #include <linux/log2.h> /** * get_order - Determine the allocation order of a memory size * @size: The size for which to get the order * * Determine the allocation order of a particular sized block of memory. This * is on a logarithmic scale, where: * * 0 -> 2^0 * PAGE_SIZE and below * 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1 * 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1 * 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1 * 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1 * ... * * The order returned is used to find the smallest allocation granule required * to hold an object of the specified size. * * The result is undefined if the size is 0. */ static __always_inline __attribute_const__ int get_order(unsigned long size) { if (__builtin_constant_p(size)) { if (!size) return BITS_PER_LONG - PAGE_SHIFT; if (size < (1UL << PAGE_SHIFT)) return 0; return ilog2((size) - 1) - PAGE_SHIFT + 1; } size--; size >>= PAGE_SHIFT; #if BITS_PER_LONG == 32 return fls(size); #else return fls64(size); #endif } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_GETORDER_H */
1 1 1 1 1 1 8 6 5 1 1 3 2 1 3 27 26 6 5 5 5 5 2 15 14 14 14 14 14 13 4 4 4 4 4 9 26 2 11 11 11 3 2 2 2 2 1 3 2 1 1 2 2 2 8 1 1 4 3 1 3 1 2 1 1 2 1 3 218 7 7 219 9 9 8 7 7 7 7 7 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 // SPDX-License-Identifier: GPL-2.0 /* XDP sockets * * AF_XDP sockets allows a channel between XDP programs and userspace * applications. * Copyright(c) 2018 Intel Corporation. * * Author(s): Björn Töpel <bjorn.topel@intel.com> * Magnus Karlsson <magnus.karlsson@intel.com> */ #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ #include <linux/if_xdp.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 struct xsk_addrs { u32 num_descs; u64 addrs[MAX_SKB_FRAGS + 1]; }; static struct kmem_cache *xsk_tx_generic_cache; void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) return; pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup |= XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_set_rx_need_wakeup); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup |= XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_set_tx_need_wakeup); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) return; pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return pool->uses_need_wakeup; } EXPORT_SYMBOL(xsk_uses_need_wakeup); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) return dev->_rx[queue_id].pool; if (queue_id < dev->real_num_tx_queues) return dev->_tx[queue_id].pool; return NULL; } EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. */ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { if (queue_id >= max_t(unsigned int, dev->real_num_rx_queues, dev->real_num_tx_queues)) return -EINVAL; if (queue_id < dev->real_num_rx_queues) dev->_rx[queue_id].pool = pool; if (queue_id < dev->real_num_tx_queues) dev->_tx[queue_id].pool = pool; return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u32 flags) { u64 addr; int err; addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; } xp_release(xskb); return 0; } static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u32 frags = xdp_buff_has_frags(xdp); struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; int err; if (frags) contd = XDP_PKT_CONTD; err = __xsk_rcv_zc(xs, xskb, len, contd); if (err) goto err; if (likely(!frags)) return 0; xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; list_del(&pos->list_node); } return 0; err: xsk_buff_free(xdp); return err; } static void *xsk_copy_xdp_start(struct xdp_buff *from) { if (unlikely(xdp_data_meta_unsupported(from))) return from->data; else return from->data_meta; } static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, u32 *from_len, skb_frag_t **frag, u32 rem) { u32 copied = 0; while (1) { u32 copy_len = min_t(u32, *from_len, to_len); memcpy(to, *from, copy_len); copied += copy_len; if (rem == copied) return copied; if (*from_len == copy_len) { *from = skb_frag_address(*frag); *from_len = skb_frag_size((*frag)++); } else { *from += copy_len; *from_len -= copy_len; } if (to_len == copy_len) return copied; to_len -= copy_len; to += copy_len; } } static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; skb_frag_t *frag; from_len = xdp->data_end - copy_from; meta_len = xdp->data - copy_from; rem = len + meta_len; if (len <= frame_size && !xdp_buff_has_frags(xdp)) { int err; xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { xs->rx_dropped++; return -ENOMEM; } memcpy(xsk_xdp->data - meta_len, copy_from, rem); xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); err = __xsk_rcv_zc(xs, xskb, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; } return 0; } num_desc = (len - 1) / frame_size + 1; if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { xs->rx_queue_full++; return -ENOBUFS; } if (xdp_buff_has_frags(xdp)) { struct skb_shared_info *sinfo; sinfo = xdp_get_shared_info_from_buff(xdp); frag = &sinfo->frags[0]; } do { u32 to_len = frame_size + meta_len; u32 copied; xsk_xdp = xsk_buff_alloc(xs->pool); copy_to = xsk_xdp->data - meta_len; copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem); rem -= copied; xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); meta_len = 0; } while (rem); return 0; } static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) return false; return true; } static void __xsk_tx_release(struct xdp_sock *xs) { __xskq_cons_release(xs->tx); if (xsk_tx_writeable(xs)) xs->sk.sk_write_space(&xs->sk); } static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { /* Matches smp_wmb() in bind(). */ smp_rmb(); return true; } return false; } static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } return 0; } static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); __xskq_cons_release(xs->pool->fq); sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (!err) { spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); spin_unlock_bh(&xs->pool->rx_lock); } return err; } static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; return xsk_rcv_zc(xs, xdp, len); } err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; } int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { int err; err = xsk_rcv(xs, xdp); if (err) return err; if (!xs->flush_node.prev) { struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); list_add(&xs->flush_node, flush_list); } return 0; } void __xsk_map_flush(struct list_head *flush_list) { struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); __list_del_clearprev(&xs->flush_node); } } void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { xskq_prod_submit_n(pool->cq, nb_entries); } EXPORT_SYMBOL(xsk_tx_completed); void xsk_tx_release(struct xsk_buff_pool *pool) { struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) __xsk_tx_release(xs); rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { bool budget_exhausted = false; struct xdp_sock *xs; rcu_read_lock(); again: list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { budget_exhausted = true; continue; } if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { if (xskq_has_descs(xs->tx)) xskq_cons_release(xs->tx); continue; } xs->tx_budget_spent++; /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } if (budget_exhausted) { list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) xs->tx_budget_spent = 0; budget_exhausted = false; goto again; } out: rcu_read_unlock(); return false; } EXPORT_SYMBOL(xsk_tx_peek_desc); static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) { struct xdp_desc *descs = pool->tx_descs; u32 nb_pkts = 0; while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) nb_pkts++; xsk_tx_release(pool); return nb_pkts; } u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); if (!xs) { nb_pkts = 0; goto out; } nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but * if there are fewer slots available, just process that many * packets. This avoids having to implement any buffering in * the Tx path. */ nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); if (!nb_pkts) { xs->tx->queue_empty_descs++; goto out; } __xskq_cons_release(xs->tx); xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: rcu_read_unlock(); return nb_pkts; } EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { int ret; spin_lock(&pool->cq_cached_prod_lock); ret = xskq_prod_reserve(pool->cq); spin_unlock(&pool->cq_cached_prod_lock); return ret; } static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) { return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; } static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) { return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); } static void xsk_inc_num_desc(struct sk_buff *skb) { struct xsk_addrs *xsk_addr; if (!xsk_skb_destructor_is_addr(skb)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; xsk_addr->num_descs++; } } static u32 xsk_get_num_desc(struct sk_buff *skb) { struct xsk_addrs *xsk_addr; if (xsk_skb_destructor_is_addr(skb)) return 1; xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; return xsk_addr->num_descs; } static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, struct sk_buff *skb) { u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; u32 descs_processed = 0; unsigned long flags; u32 idx, i; spin_lock_irqsave(&pool->cq_prod_lock, flags); idx = xskq_get_prod(pool->cq); if (unlikely(num_descs > 1)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_descs; i++) { xskq_prod_write_addr(pool->cq, idx + descs_processed, xsk_addr->addrs[i]); descs_processed++; } kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } else { xskq_prod_write_addr(pool->cq, idx, xsk_skb_destructor_get_addr(skb)); descs_processed++; } xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_prod_lock, flags); } static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { spin_lock(&pool->cq_cached_prod_lock); xskq_prod_cancel_n(pool->cq, n); spin_unlock(&pool->cq_cached_prod_lock); } INDIRECT_CALLABLE_SCOPE void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; if (compl->tx_timestamp) { /* sw completion timestamp, not a real one */ *compl->tx_timestamp = ktime_get_tai_fast_ns(); } xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, u64 addr) { skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_skb_destructor_set_addr(skb, addr); } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; if (unlikely(num_descs > 1)) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } skb->destructor = sock_wfree; xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; } static void xsk_drop_skb(struct sk_buff *skb) { xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); xsk_consume_skb(skb); } static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, struct xdp_desc *desc, struct xsk_buff_pool *pool, u32 hr) { struct xsk_tx_metadata *meta = NULL; if (unlikely(pool->tx_metadata_len == 0)) return -EINVAL; meta = buffer - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return -EINVAL; if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { if (unlikely(meta->request.csum_start + meta->request.csum_offset + sizeof(__sum16) > desc->len)) return -EINVAL; skb->csum_start = hr + meta->request.csum_start; skb->csum_offset = meta->request.csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; if (unlikely(pool->tx_sw_csum)) { int err; err = skb_checksum_help(skb); if (err) return err; } } if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) skb->skb_mstamp_ns = meta->request.launch_time; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); return 0; } static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; addr = desc->addr; buffer = xsk_buff_raw_get_data(pool, addr); if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); if (unlikely(!skb)) return ERR_PTR(err); skb_reserve(skb, hr); xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); if (unlikely(err)) return ERR_PTR(err); } } else { struct xsk_addrs *xsk_addr; if (xsk_skb_destructor_is_addr(skb)) { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) return ERR_PTR(-ENOMEM); xsk_addr->num_descs = 1; xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; } else { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; } /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb * would be dropped, which implies freeing all list elements */ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; offset = offset_in_page(buffer); addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { if (unlikely(i >= MAX_SKB_FRAGS)) return ERR_PTR(-EOVERFLOW); page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); copy = min_t(u32, PAGE_SIZE - offset, len - copied); skb_fill_page_desc(skb, i, page, offset, copy); copied += copy; addr += copy; offset = 0; } skb->len += len; skb->data_len += len; skb->truesize += ts; refcount_add(ts, &xs->sk.sk_wmem_alloc); return skb; } static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); skb = NULL; goto free_err; } } else { u32 hr, tr, len; void *buffer; buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); if (unlikely(!skb)) goto free_err; skb_reserve(skb, hr); skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr); if (unlikely(err)) goto free_err; } } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct xsk_addrs *xsk_addr; struct page *page; u8 *vaddr; if (xsk_skb_destructor_is_addr(skb)) { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) { err = -ENOMEM; goto free_err; } xsk_addr->num_descs = 1; xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; } else { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; } if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; } page = alloc_page(xs->sk.sk_allocation); if (unlikely(!page)) { err = -EAGAIN; goto free_err; } vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } } xsk_inc_num_desc(skb); return skb; free_err: if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); if (err == -EOVERFLOW) { /* Drop the packet */ xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { /* Let application retry */ xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; u32 max_batch; int err = 0; mutex_lock(&xs->mutex); /* Since we dropped the RCU read lock, the socket state might have changed. */ if (unlikely(!xsk_is_bound(xs))) { err = -ENXIO; goto out; } if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; goto out; } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); if (err != -EOVERFLOW) goto out; err = 0; continue; } xskq_cons_release(xs->tx); if (xp_mb_desc(&desc)) { xs->skb = skb; continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); xsk_consume_skb(skb); err = -EAGAIN; goto out; } /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; xs->skb = NULL; goto out; } sent_frame = true; xs->skb = NULL; } if (xskq_has_descs(xs->tx)) { if (xs->skb) xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } out: if (sent_frame) __xsk_tx_release(xs); mutex_unlock(&xs->mutex); return err; } static int xsk_generic_xmit(struct sock *sk) { int ret; /* Drop the RCU lock since the SKB path might sleep. */ rcu_read_unlock(); ret = __xsk_generic_xmit(sk); /* Reaquire RCU lock before going into common code. */ rcu_read_lock(); return ret; } static bool xsk_no_wakeup(struct sock *sk) { #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif } static int xsk_check_common(struct xdp_sock *xs) { if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; return 0; } static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(need_wait)) return -EOPNOTSUPP; if (unlikely(!xs->tx)) return -ENOBUFS; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xs->zc && xsk_no_wakeup(sk)) return 0; pool = xs->pool; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { if (xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_TX); return xsk_generic_xmit(sk); } return 0; } static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; rcu_read_lock(); ret = __xsk_sendmsg(sock, m, total_len); rcu_read_unlock(); return ret; } static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { bool need_wait = !(flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(!xs->rx)) return -ENOBUFS; if (unlikely(need_wait)) return -EOPNOTSUPP; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xsk_no_wakeup(sk)) return 0; if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_RX); return 0; } static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { int ret; rcu_read_lock(); ret = __xsk_recvmsg(sock, m, len, flags); rcu_read_unlock(); return ret; } static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; sock_poll_wait(file, sock, wait); rcu_read_lock(); if (xsk_check_common(xs)) goto out; pool = xs->pool; if (pool->cached_need_wakeup) { if (xs->zc) xsk_wakeup(xs, pool->cached_need_wakeup); else if (xs->tx) /* Poll needs to drive Tx also in copy mode */ xsk_generic_xmit(sk); } if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && xsk_tx_writeable(xs)) mask |= EPOLLOUT | EPOLLWRNORM; out: rcu_read_unlock(); return mask; } static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; /* Make sure queue is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(*queue, q); return 0; } static void xsk_unbind_dev(struct xdp_sock *xs) { struct net_device *dev = xs->dev; if (xs->state != XSK_BOUND) return; WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ xp_del_xsk(xs->pool, xs); synchronize_net(); dev_put(dev); } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; *map_entry = NULL; spin_lock_bh(&xs->map_list_lock); node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } spin_unlock_bh(&xs->map_list_lock); return map; } static void xsk_delete_from_maps(struct xdp_sock *xs) { /* This function removes the current XDP socket from all the * maps it resides in. We need to take extra care here, due to * the two locks involved. Each map has a lock synchronizing * updates to the entries, and each socket has a lock that * synchronizes access to the list of maps (map_list). For * deadlock avoidance the locks need to be taken in the order * "map lock"->"socket map list lock". We start off by * accessing the socket map list, and take a reference to the * map to guarantee existence between the * xsk_get_map_list_entry() and xsk_map_try_sock_delete() * calls. Then we ask the map to remove the socket, which * tries to remove the socket from the map. Note that there * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); bpf_map_put(&map->map); } } static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) return 0; net = sock_net(sk); if (xs->skb) xsk_drop_skb(xs->skb); mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, sk->sk_prot, -1); xsk_delete_from_maps(xs); mutex_lock(&xs->mutex); xsk_unbind_dev(xs); mutex_unlock(&xs->mutex); xskq_destroy(xs->rx); xskq_destroy(xs->tx); xskq_destroy(xs->fq_tmp); xskq_destroy(xs->cq_tmp); sock_orphan(sk); sock->sk = NULL; sock_put(sk); return 0; } static struct socket *xsk_lookup_xsk_from_fd(int fd) { struct socket *sock; int err; sock = sockfd_lookup(fd, &err); if (!sock) return ERR_PTR(-ENOTSOCK); if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return ERR_PTR(-ENOPROTOOPT); } return sock; } static bool xsk_validate_queues(struct xdp_sock *xs) { return xs->fq_tmp && xs->cq_tmp; } static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; int bound_dev_if; u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) return -EINVAL; if (sxdp->sxdp_family != AF_XDP) return -EINVAL; flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) return -EINVAL; rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; goto out_release; } netdev_lock_ops(dev); if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } qid = sxdp->sxdp_queue_id; if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; } if (xs->umem) { /* We have already our own. */ err = -EINVAL; goto out_unlock; } sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); if (IS_ERR(sock)) { err = PTR_ERR(sock); goto out_unlock; } umem_xs = xdp_sk(sock->sk); if (!xsk_is_bound(umem_xs)) { err = -EBADF; sockfd_put(sock); goto out_unlock; } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { /* Share the umem with another socket on another qid * and/or device. */ xs->pool = xp_create_and_assign_umem(xs, umem_xs->umem); if (!xs->pool) { err = -ENOMEM; sockfd_put(sock); goto out_unlock; } err = xp_assign_dev_shared(xs->pool, umem_xs, dev, qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } else { /* Share the buffer pool with the other socket. */ if (xs->fq_tmp || xs->cq_tmp) { /* Do not allow setting your own fq or cq. */ err = -EINVAL; sockfd_put(sock); goto out_unlock; } xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; /* If underlying shared umem was created without Tx * ring, allocate Tx descs array that Tx batching API * utilizes */ if (xs->tx && !xs->pool->tx_descs) { err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } } xdp_get_umem(umem_xs->umem); WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xsk_validate_queues(xs)) { err = -EINVAL; goto out_unlock; } else { /* This xsk has its own umem. */ xs->pool = xp_create_and_assign_umem(xs, xs->umem); if (!xs->pool) { err = -ENOMEM; goto out_unlock; } err = xp_assign_dev(xs->pool, dev, qid, flags); if (err) { xp_destroy(xs->pool); xs->pool = NULL; goto out_unlock; } } /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ xs->fq_tmp = NULL; xs->cq_tmp = NULL; xs->dev = dev; xs->zc = xs->umem->zc; xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); if (rxq->napi) __sk_mark_napi_id_once(sk, rxq->napi->napi_id); } out_unlock: if (err) { dev_put(dev); } else { /* Matches smp_rmb() in bind() for shared umem * sockets, and xsk_is_bound(). */ smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); return err; } struct xdp_umem_reg_v1 { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; }; static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; if (level != SOL_XDP) return -ENOPROTOOPT; switch (optname) { case XDP_RX_RING: case XDP_TX_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); if (!err && optname == XDP_TX_RING) /* Tx needs to be explicitly woken up the first time */ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } case XDP_UMEM_REG: { size_t mr_size = sizeof(struct xdp_umem_reg); struct xdp_umem_reg mr = {}; struct xdp_umem *umem; if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); /* Make sure the last field of the struct doesn't have * uninitialized padding. All padding has to be explicit * and has to be set to zero by the userspace to make * struct xdp_umem_reg extensible in the future. */ BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + sizeof_field(struct xdp_umem_reg, tx_metadata_len) != sizeof(struct xdp_umem_reg)); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } umem = xdp_umem_create(&mr); if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } case XDP_UMEM_FILL_RING: case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : &xs->cq_tmp; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } case XDP_MAX_TX_SKB_BUDGET: { unsigned int budget; if (optlen != sizeof(budget)) return -EINVAL; if (copy_from_sockptr(&budget, optval, sizeof(budget))) return -EFAULT; if (!xs->tx || budget < TX_BATCH_SIZE || budget > xs->tx->nentries) return -EACCES; WRITE_ONCE(xs->max_tx_budget, budget); return 0; } default: break; } return -ENOPROTOOPT; } static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_rxtx_ring, desc); } static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_umem_ring, desc); } struct xdp_statistics_v1 { __u64 rx_dropped; __u64 rx_invalid_descs; __u64 tx_invalid_descs; }; static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int len; if (level != SOL_XDP) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case XDP_STATISTICS: { struct xdp_statistics stats = {}; bool extra_stats = true; size_t stats_size; if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; } else if (len < sizeof(stats)) { extra_stats = false; stats_size = sizeof(struct xdp_statistics_v1); } else { stats_size = sizeof(stats); } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; if (extra_stats) { stats.rx_ring_full = xs->rx_queue_full; stats.rx_fill_ring_empty_descs = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); } else { stats.rx_dropped += xs->rx_queue_full; } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; if (put_user(stats_size, optlen)) return -EFAULT; return 0; } case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; struct xdp_mmap_offsets_v1 off_v1; bool flags_supported = true; void *to_copy; if (len < sizeof(off_v1)) return -EINVAL; else if (len < sizeof(off)) flags_supported = false; if (flags_supported) { /* xdp_ring_offset is identical to xdp_ring_offset_v1 * except for the flags field added to the end. */ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.rx); xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.tx); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.fr); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.cr); off.rx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.tx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.fr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); off.cr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); len = sizeof(off); to_copy = &off; } else { xsk_enter_rxtx_offsets(&off_v1.rx); xsk_enter_rxtx_offsets(&off_v1.tx); xsk_enter_umem_offsets(&off_v1.fr); xsk_enter_umem_offsets(&off_v1.cr); len = sizeof(off_v1); to_copy = &off_v1; } if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } case XDP_OPTIONS: { struct xdp_options opts = {}; if (len < sizeof(opts)) return -EINVAL; mutex_lock(&xs->mutex); if (xs->zc) opts.flags |= XDP_OPTIONS_ZEROCOPY; mutex_unlock(&xs->mutex); len = sizeof(opts); if (copy_to_user(optval, &opts, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } default: break; } return -EOPNOTSUPP; } static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); int state = READ_ONCE(xs->state); struct xsk_queue *q = NULL; if (state != XSK_READY && state != XSK_BOUND) return -EBUSY; if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { q = READ_ONCE(xs->tx); } else { /* Matches the smp_wmb() in XDP_UMEM_REG */ smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : READ_ONCE(xs->pool->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : READ_ONCE(xs->pool->cq); } if (!q) return -EINVAL; /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); if (size > q->ring_vmalloc_size) return -EINVAL; return remap_vmalloc_range(vma, q->ring, 0); } static int xsk_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct sock *sk; switch (msg) { case NETDEV_UNREGISTER: mutex_lock(&net->xdp.lock); sk_for_each(sk, &net->xdp.list) { struct xdp_sock *xs = xdp_sk(sk); mutex_lock(&xs->mutex); if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); xsk_unbind_dev(xs); /* Clear device references. */ xp_clear_dev(xs->pool); } mutex_unlock(&xs->mutex); } mutex_unlock(&net->xdp.lock); break; } return NOTIFY_DONE; } static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, .obj_size = sizeof(struct xdp_sock), }; static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, }; static void xsk_destruct(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); if (!sock_flag(sk, SOCK_DEAD)) return; if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); } static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { struct xdp_sock *xs; struct sock *sk; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); if (!sk) return -ENOBUFS; sock->ops = &xsk_proto_ops; sock_init_data(sock, sk); sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); xs->state = XSK_READY; xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, &xsk_proto, 1); return 0; } static const struct net_proto_family xsk_family_ops = { .family = PF_XDP, .create = xsk_create, .owner = THIS_MODULE, }; static struct notifier_block xsk_netdev_notifier = { .notifier_call = xsk_notifier, }; static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); INIT_HLIST_HEAD(&net->xdp.list); return 0; } static void __net_exit xsk_net_exit(struct net *net) { WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); } static struct pernet_operations xsk_net_ops = { .init = xsk_net_init, .exit = xsk_net_exit, }; static int __init xsk_init(void) { int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) goto out; err = sock_register(&xsk_family_ops); if (err) goto out_proto; err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; err = register_netdevice_notifier(&xsk_netdev_notifier); if (err) goto out_pernet; xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", sizeof(struct xsk_addrs), 0, SLAB_HWCACHE_ALIGN, NULL); if (!xsk_tx_generic_cache) { err = -ENOMEM; goto out_unreg_notif; } return 0; out_unreg_notif: unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: return err; } fs_initcall(xsk_init);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 1 1 3 3 3 2 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 1 3 3 3 3 3 3 2 2 3 2 3 2 1 1 1 1 3 1 1 3 1 1 3 3 3 3 3 2 3 3 3 3 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 /* * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * * DRM core CRTC related functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. * * Authors: * Keith Packard * Eric Anholt <eric@anholt.net> * Dave Airlie <airlied@linux.ie> * Jesse Barnes <jesse.barnes@intel.com> */ #include <linux/export.h> #include <linux/moduleparam.h> #include <drm/drm_bridge.h> #include <drm/drm_client_event.h> #include <drm/drm_crtc.h> #include <drm/drm_edid.h> #include <drm/drm_fourcc.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_probe_helper.h> #include <drm/drm_sysfs.h> #include "drm_crtc_helper_internal.h" /** * DOC: output probing helper overview * * This library provides some helper code for output probing. It provides an * implementation of the core &drm_connector_funcs.fill_modes interface with * drm_helper_probe_single_connector_modes(). * * It also provides support for polling connectors with a work item and for * generic hotplug interrupt handling where the driver doesn't or cannot keep * track of a per-connector hpd interrupt. * * This helper library can be used independently of the modeset helper library. * Drivers can also overwrite different parts e.g. use their own hotplug * handling code to avoid probing unrelated outputs. * * The probe helpers share the function table structures with other display * helper libraries. See &struct drm_connector_helper_funcs for the details. */ static bool drm_kms_helper_poll = true; module_param_named(poll, drm_kms_helper_poll, bool, 0600); static enum drm_mode_status drm_mode_validate_flag(const struct drm_display_mode *mode, int flags) { if ((mode->flags & DRM_MODE_FLAG_INTERLACE) && !(flags & DRM_MODE_FLAG_INTERLACE)) return MODE_NO_INTERLACE; if ((mode->flags & DRM_MODE_FLAG_DBLSCAN) && !(flags & DRM_MODE_FLAG_DBLSCAN)) return MODE_NO_DBLESCAN; if ((mode->flags & DRM_MODE_FLAG_3D_MASK) && !(flags & DRM_MODE_FLAG_3D_MASK)) return MODE_NO_STEREO; return MODE_OK; } static int drm_mode_validate_pipeline(struct drm_display_mode *mode, struct drm_connector *connector, struct drm_modeset_acquire_ctx *ctx, enum drm_mode_status *status) { struct drm_device *dev = connector->dev; struct drm_encoder *encoder; int ret; /* Step 1: Validate against connector */ ret = drm_connector_mode_valid(connector, mode, ctx, status); if (ret || *status != MODE_OK) return ret; /* Step 2: Validate against encoders and crtcs */ drm_connector_for_each_possible_encoder(connector, encoder) { struct drm_bridge *bridge; struct drm_crtc *crtc; *status = drm_encoder_mode_valid(encoder, mode); if (*status != MODE_OK) { /* No point in continuing for crtc check as this encoder * will not accept the mode anyway. If all encoders * reject the mode then, at exit, ret will not be * MODE_OK. */ continue; } bridge = drm_bridge_chain_get_first_bridge(encoder); *status = drm_bridge_chain_mode_valid(bridge, &connector->display_info, mode); drm_bridge_put(bridge); if (*status != MODE_OK) { /* There is also no point in continuing for crtc check * here. */ continue; } drm_for_each_crtc(crtc, dev) { if (!drm_encoder_crtc_ok(encoder, crtc)) continue; *status = drm_crtc_mode_valid(crtc, mode); if (*status == MODE_OK) { /* If we get to this point there is at least * one combination of encoder+crtc that works * for this mode. Lets return now. */ return 0; } } } return 0; } static int drm_helper_probe_add_cmdline_mode(struct drm_connector *connector) { struct drm_cmdline_mode *cmdline_mode; struct drm_display_mode *mode; cmdline_mode = &connector->cmdline_mode; if (!cmdline_mode->specified) return 0; /* Only add a GTF mode if we find no matching probed modes */ list_for_each_entry(mode, &connector->probed_modes, head) { if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { /* The probed mode's vrefresh is set until later */ if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } /* Mark the matching mode as being preferred by the user */ mode->type |= DRM_MODE_TYPE_USERDEF; return 0; } mode = drm_mode_create_from_cmdline_mode(connector->dev, cmdline_mode); if (mode == NULL) return 0; drm_mode_probed_add(connector, mode); return 1; } enum drm_mode_status drm_crtc_mode_valid(struct drm_crtc *crtc, const struct drm_display_mode *mode) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; if (!crtc_funcs || !crtc_funcs->mode_valid) return MODE_OK; return crtc_funcs->mode_valid(crtc, mode); } enum drm_mode_status drm_encoder_mode_valid(struct drm_encoder *encoder, const struct drm_display_mode *mode) { const struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private; if (!encoder_funcs || !encoder_funcs->mode_valid) return MODE_OK; return encoder_funcs->mode_valid(encoder, mode); } int drm_connector_mode_valid(struct drm_connector *connector, const struct drm_display_mode *mode, struct drm_modeset_acquire_ctx *ctx, enum drm_mode_status *status) { const struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; int ret = 0; if (!connector_funcs) *status = MODE_OK; else if (connector_funcs->mode_valid_ctx) ret = connector_funcs->mode_valid_ctx(connector, mode, ctx, status); else if (connector_funcs->mode_valid) *status = connector_funcs->mode_valid(connector, mode); else *status = MODE_OK; return ret; } static void drm_kms_helper_disable_hpd(struct drm_device *dev) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { const struct drm_connector_helper_funcs *funcs = connector->helper_private; if (funcs && funcs->disable_hpd) funcs->disable_hpd(connector); } drm_connector_list_iter_end(&conn_iter); } static bool drm_kms_helper_enable_hpd(struct drm_device *dev) { bool poll = false; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { const struct drm_connector_helper_funcs *funcs = connector->helper_private; if (funcs && funcs->enable_hpd) funcs->enable_hpd(connector); if (connector->polled & (DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT)) poll = true; } drm_connector_list_iter_end(&conn_iter); return poll; } #define DRM_OUTPUT_POLL_PERIOD (10*HZ) static void reschedule_output_poll_work(struct drm_device *dev) { unsigned long delay = DRM_OUTPUT_POLL_PERIOD; if (dev->mode_config.delayed_event) /* * FIXME: * * Use short (1s) delay to handle the initial delayed event. * This delay should not be needed, but Optimus/nouveau will * fail in a mysterious way if the delayed event is handled as * soon as possible like it is done in * drm_helper_probe_single_connector_modes() in case the poll * was enabled before. */ delay = HZ; schedule_delayed_work(&dev->mode_config.output_poll_work, delay); } /** * drm_kms_helper_poll_enable - re-enable output polling. * @dev: drm_device * * This function re-enables the output polling work, after it has been * temporarily disabled using drm_kms_helper_poll_disable(), for example over * suspend/resume. * * Drivers can call this helper from their device resume implementation. It is * not an error to call this even when output polling isn't enabled. * * If device polling was never initialized before, this call will trigger a * warning and return. * * Note that calls to enable and disable polling must be strictly ordered, which * is automatically the case when they're only call from suspend/resume * callbacks. */ void drm_kms_helper_poll_enable(struct drm_device *dev) { if (drm_WARN_ON_ONCE(dev, !dev->mode_config.poll_enabled) || !drm_kms_helper_poll || dev->mode_config.poll_running) return; if (drm_kms_helper_enable_hpd(dev) || dev->mode_config.delayed_event) reschedule_output_poll_work(dev); dev->mode_config.poll_running = true; } EXPORT_SYMBOL(drm_kms_helper_poll_enable); /** * drm_kms_helper_poll_reschedule - reschedule the output polling work * @dev: drm_device * * This function reschedules the output polling work, after polling for a * connector has been enabled. * * Drivers must call this helper after enabling polling for a connector by * setting %DRM_CONNECTOR_POLL_CONNECT / %DRM_CONNECTOR_POLL_DISCONNECT flags * in drm_connector::polled. Note that after disabling polling by clearing these * flags for a connector will stop the output polling work automatically if * the polling is disabled for all other connectors as well. * * The function can be called only after polling has been enabled by calling * drm_kms_helper_poll_init() / drm_kms_helper_poll_enable(). */ void drm_kms_helper_poll_reschedule(struct drm_device *dev) { if (dev->mode_config.poll_running) reschedule_output_poll_work(dev); } EXPORT_SYMBOL(drm_kms_helper_poll_reschedule); static int detect_connector_status(struct drm_connector *connector, struct drm_modeset_acquire_ctx *ctx, bool force) { const struct drm_connector_helper_funcs *funcs = connector->helper_private; if (funcs->detect_ctx) return funcs->detect_ctx(connector, ctx, force); else if (connector->funcs->detect) return connector->funcs->detect(connector, force); return connector_status_connected; } static enum drm_connector_status drm_helper_probe_detect_ctx(struct drm_connector *connector, bool force) { struct drm_modeset_acquire_ctx ctx; int ret; drm_modeset_acquire_init(&ctx, 0); retry: ret = drm_modeset_lock(&connector->dev->mode_config.connection_mutex, &ctx); if (!ret) ret = detect_connector_status(connector, &ctx, force); if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } if (WARN_ON(ret < 0)) ret = connector_status_unknown; if (ret != connector->status) connector->epoch_counter += 1; drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; } /** * drm_helper_probe_detect - probe connector status * @connector: connector to probe * @ctx: acquire_ctx, or NULL to let this function handle locking. * @force: Whether destructive probe operations should be performed. * * This function calls the detect callbacks of the connector. * This function returns &drm_connector_status, or * if @ctx is set, it might also return -EDEADLK. */ int drm_helper_probe_detect(struct drm_connector *connector, struct drm_modeset_acquire_ctx *ctx, bool force) { struct drm_device *dev = connector->dev; int ret; if (!ctx) return drm_helper_probe_detect_ctx(connector, force); ret = drm_modeset_lock(&dev->mode_config.connection_mutex, ctx); if (ret) return ret; ret = detect_connector_status(connector, ctx, force); if (ret != connector->status) connector->epoch_counter += 1; return ret; } EXPORT_SYMBOL(drm_helper_probe_detect); static int drm_helper_probe_get_modes(struct drm_connector *connector) { const struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; int count; count = connector_funcs->get_modes(connector); /* The .get_modes() callback should not return negative values. */ if (count < 0) { drm_err(connector->dev, ".get_modes() returned %pe\n", ERR_PTR(count)); count = 0; } /* * Fallback for when DDC probe failed in drm_get_edid() and thus skipped * override/firmware EDID. */ if (count == 0 && connector->status == connector_status_connected) count = drm_edid_override_connector_update(connector); return count; } static int __drm_helper_update_and_validate(struct drm_connector *connector, uint32_t maxX, uint32_t maxY, struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev = connector->dev; struct drm_display_mode *mode; int mode_flags = 0; int ret; drm_connector_list_update(connector); if (connector->interlace_allowed) mode_flags |= DRM_MODE_FLAG_INTERLACE; if (connector->doublescan_allowed) mode_flags |= DRM_MODE_FLAG_DBLSCAN; if (connector->stereo_allowed) mode_flags |= DRM_MODE_FLAG_3D_MASK; list_for_each_entry(mode, &connector->modes, head) { if (mode->status != MODE_OK) continue; mode->status = drm_mode_validate_driver(dev, mode); if (mode->status != MODE_OK) continue; mode->status = drm_mode_validate_size(mode, maxX, maxY); if (mode->status != MODE_OK) continue; mode->status = drm_mode_validate_flag(mode, mode_flags); if (mode->status != MODE_OK) continue; mode->status = drm_mode_validate_ycbcr420(mode, connector); if (mode->status != MODE_OK) continue; ret = drm_mode_validate_pipeline(mode, connector, ctx, &mode->status); if (ret) { drm_dbg_kms(dev, "drm_mode_validate_pipeline failed: %d\n", ret); if (drm_WARN_ON_ONCE(dev, ret != -EDEADLK)) mode->status = MODE_ERROR; else return -EDEADLK; } } return 0; } /** * drm_helper_probe_single_connector_modes - get complete set of display modes * @connector: connector to probe * @maxX: max width for modes * @maxY: max height for modes * * Based on the helper callbacks implemented by @connector in struct * &drm_connector_helper_funcs try to detect all valid modes. Modes will first * be added to the connector's probed_modes list, then culled (based on validity * and the @maxX, @maxY parameters) and put into the normal modes list. * * Intended to be used as a generic implementation of the * &drm_connector_funcs.fill_modes() vfunc for drivers that use the CRTC helpers * for output mode filtering and detection. * * The basic procedure is as follows * * 1. All modes currently on the connector's modes list are marked as stale * * 2. New modes are added to the connector's probed_modes list with * drm_mode_probed_add(). New modes start their life with status as OK. * Modes are added from a single source using the following priority order. * * - &drm_connector_helper_funcs.get_modes vfunc * - if the connector status is connector_status_connected, standard * VESA DMT modes up to 1024x768 are automatically added * (drm_add_modes_noedid()) * * Finally modes specified via the kernel command line (video=...) are * added in addition to what the earlier probes produced * (drm_helper_probe_add_cmdline_mode()). These modes are generated * using the VESA GTF/CVT formulas. * * 3. Modes are moved from the probed_modes list to the modes list. Potential * duplicates are merged together (see drm_connector_list_update()). * After this step the probed_modes list will be empty again. * * 4. Any non-stale mode on the modes list then undergoes validation * * - drm_mode_validate_basic() performs basic sanity checks * - drm_mode_validate_size() filters out modes larger than @maxX and @maxY * (if specified) * - drm_mode_validate_flag() checks the modes against basic connector * capabilities (interlace_allowed,doublescan_allowed,stereo_allowed) * - the optional &drm_connector_helper_funcs.mode_valid or * &drm_connector_helper_funcs.mode_valid_ctx helpers can perform driver * and/or sink specific checks * - the optional &drm_crtc_helper_funcs.mode_valid, * &drm_bridge_funcs.mode_valid and &drm_encoder_helper_funcs.mode_valid * helpers can perform driver and/or source specific checks which are also * enforced by the modeset/atomic helpers * * 5. Any mode whose status is not OK is pruned from the connector's modes list, * accompanied by a debug message indicating the reason for the mode's * rejection (see drm_mode_prune_invalid()). * * Returns: * The number of modes found on @connector. */ int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY) { struct drm_device *dev = connector->dev; struct drm_display_mode *mode; int count = 0, ret; enum drm_connector_status old_status; struct drm_modeset_acquire_ctx ctx; WARN_ON(!mutex_is_locked(&dev->mode_config.mutex)); drm_modeset_acquire_init(&ctx, 0); drm_dbg_kms(dev, "[CONNECTOR:%d:%s]\n", connector->base.id, connector->name); retry: ret = drm_modeset_lock(&dev->mode_config.connection_mutex, &ctx); if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } else WARN_ON(ret < 0); /* set all old modes to the stale state */ list_for_each_entry(mode, &connector->modes, head) mode->status = MODE_STALE; old_status = connector->status; if (connector->force) { if (connector->force == DRM_FORCE_ON || connector->force == DRM_FORCE_ON_DIGITAL) connector->status = connector_status_connected; else connector->status = connector_status_disconnected; if (connector->funcs->force) connector->funcs->force(connector); } else { ret = drm_helper_probe_detect(connector, &ctx, true); if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } else if (WARN(ret < 0, "Invalid return value %i for connector detection\n", ret)) ret = connector_status_unknown; connector->status = ret; } /* * Normally either the driver's hpd code or the poll loop should * pick up any changes and fire the hotplug event. But if * userspace sneaks in a probe, we might miss a change. Hence * check here, and if anything changed start the hotplug code. */ if (old_status != connector->status) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] status updated from %s to %s\n", connector->base.id, connector->name, drm_get_connector_status_name(old_status), drm_get_connector_status_name(connector->status)); /* * The hotplug event code might call into the fb * helpers, and so expects that we do not hold any * locks. Fire up the poll struct instead, it will * disable itself again. */ dev->mode_config.delayed_event = true; if (dev->mode_config.poll_enabled) mod_delayed_work(system_wq, &dev->mode_config.output_poll_work, 0); } /* * Re-enable polling in case the global poll config changed but polling * is still initialized. */ if (dev->mode_config.poll_enabled) drm_kms_helper_poll_enable(dev); if (connector->status == connector_status_disconnected) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disconnected\n", connector->base.id, connector->name); drm_connector_update_edid_property(connector, NULL); drm_mode_prune_invalid(dev, &connector->modes, false); goto exit; } count = drm_helper_probe_get_modes(connector); if (count == 0 && (connector->status == connector_status_connected || connector->status == connector_status_unknown)) { count = drm_add_modes_noedid(connector, 1024, 768); /* * Section 4.2.2.6 (EDID Corruption Detection) of the DP 1.4a * Link CTS specifies that 640x480 (the official "failsafe" * mode) needs to be the default if there's no EDID. */ if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) drm_set_preferred_mode(connector, 640, 480); } count += drm_helper_probe_add_cmdline_mode(connector); if (count != 0) { ret = __drm_helper_update_and_validate(connector, maxX, maxY, &ctx); if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } } drm_mode_prune_invalid(dev, &connector->modes, true); /* * Displayport spec section 5.2.1.2 ("Video Timing Format") says that * all detachable sinks shall support 640x480 @60Hz as a fail safe * mode. If all modes were pruned, perhaps because they need more * lanes or a higher pixel clock than available, at least try to add * in 640x480. */ if (list_empty(&connector->modes) && connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) { count = drm_add_modes_noedid(connector, 640, 480); ret = __drm_helper_update_and_validate(connector, maxX, maxY, &ctx); if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } drm_mode_prune_invalid(dev, &connector->modes, true); } exit: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); if (list_empty(&connector->modes)) return 0; drm_mode_sort(&connector->modes); drm_dbg_kms(dev, "[CONNECTOR:%d:%s] probed modes:\n", connector->base.id, connector->name); list_for_each_entry(mode, &connector->modes, head) { drm_mode_set_crtcinfo(mode, CRTC_INTERLACE_HALVE_V); drm_dbg_kms(dev, "Probed mode: " DRM_MODE_FMT "\n", DRM_MODE_ARG(mode)); } return count; } EXPORT_SYMBOL(drm_helper_probe_single_connector_modes); /** * drm_kms_helper_hotplug_event - fire off KMS hotplug events * @dev: drm_device whose connector state changed * * This function fires off the uevent for userspace and also calls the * client hotplug function, which is most commonly used to inform the fbdev * emulation code and allow it to update the fbcon output configuration. * * Drivers should call this from their hotplug handling code when a change is * detected. Note that this function does not do any output detection of its * own, like drm_helper_hpd_irq_event() does - this is assumed to be done by the * driver already. * * This function must be called from process context with no mode * setting locks held. * * If only a single connector has changed, consider calling * drm_kms_helper_connector_hotplug_event() instead. */ void drm_kms_helper_hotplug_event(struct drm_device *dev) { drm_sysfs_hotplug_event(dev); drm_client_dev_hotplug(dev); } EXPORT_SYMBOL(drm_kms_helper_hotplug_event); /** * drm_kms_helper_connector_hotplug_event - fire off a KMS connector hotplug event * @connector: drm_connector which has changed * * This is the same as drm_kms_helper_hotplug_event(), except it fires a more * fine-grained uevent for a single connector. */ void drm_kms_helper_connector_hotplug_event(struct drm_connector *connector) { struct drm_device *dev = connector->dev; drm_sysfs_connector_hotplug_event(connector); drm_client_dev_hotplug(dev); } EXPORT_SYMBOL(drm_kms_helper_connector_hotplug_event); static void output_poll_execute(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct drm_device *dev = container_of(delayed_work, struct drm_device, mode_config.output_poll_work); struct drm_connector *connector; struct drm_connector_list_iter conn_iter; enum drm_connector_status old_status; bool repoll = false, changed; u64 old_epoch_counter; if (!dev->mode_config.poll_enabled) return; /* Pick up any changes detected by the probe functions. */ changed = dev->mode_config.delayed_event; dev->mode_config.delayed_event = false; if (!drm_kms_helper_poll) { if (dev->mode_config.poll_running) { drm_kms_helper_disable_hpd(dev); dev->mode_config.poll_running = false; } goto out; } if (!mutex_trylock(&dev->mode_config.mutex)) { repoll = true; goto out; } drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { /* Ignore forced connectors. */ if (connector->force) continue; /* Ignore HPD capable connectors and connectors where we don't * want any hotplug detection at all for polling. */ if (!connector->polled || connector->polled == DRM_CONNECTOR_POLL_HPD) continue; old_status = connector->status; /* if we are connected and don't want to poll for disconnect skip it */ if (old_status == connector_status_connected && !(connector->polled & DRM_CONNECTOR_POLL_DISCONNECT)) continue; repoll = true; old_epoch_counter = connector->epoch_counter; connector->status = drm_helper_probe_detect(connector, NULL, false); if (old_epoch_counter != connector->epoch_counter) { const char *old, *new; /* * The poll work sets force=false when calling detect so * that drivers can avoid to do disruptive tests (e.g. * when load detect cycles could cause flickering on * other, running displays). This bears the risk that we * flip-flop between unknown here in the poll work and * the real state when userspace forces a full detect * call after receiving a hotplug event due to this * change. * * Hence clamp an unknown detect status to the old * value. */ if (connector->status == connector_status_unknown) { connector->status = old_status; continue; } old = drm_get_connector_status_name(old_status); new = drm_get_connector_status_name(connector->status); drm_dbg_kms(dev, "[CONNECTOR:%d:%s] status updated from %s to %s\n", connector->base.id, connector->name, old, new); drm_dbg_kms(dev, "[CONNECTOR:%d:%s] epoch counter %llu -> %llu\n", connector->base.id, connector->name, old_epoch_counter, connector->epoch_counter); changed = true; } } drm_connector_list_iter_end(&conn_iter); mutex_unlock(&dev->mode_config.mutex); out: if (changed) drm_kms_helper_hotplug_event(dev); if (repoll) schedule_delayed_work(delayed_work, DRM_OUTPUT_POLL_PERIOD); } /** * drm_kms_helper_is_poll_worker - is %current task an output poll worker? * * Determine if %current task is an output poll worker. This can be used * to select distinct code paths for output polling versus other contexts. * * One use case is to avoid a deadlock between the output poll worker and * the autosuspend worker wherein the latter waits for polling to finish * upon calling drm_kms_helper_poll_disable(), while the former waits for * runtime suspend to finish upon calling pm_runtime_get_sync() in a * connector ->detect hook. */ bool drm_kms_helper_is_poll_worker(void) { struct work_struct *work = current_work(); return work && work->func == output_poll_execute; } EXPORT_SYMBOL(drm_kms_helper_is_poll_worker); /** * drm_kms_helper_poll_disable - disable output polling * @dev: drm_device * * This function disables the output polling work. * * Drivers can call this helper from their device suspend implementation. It is * not an error to call this even when output polling isn't enabled or already * disabled. Polling is re-enabled by calling drm_kms_helper_poll_enable(). * * If however, the polling was never initialized, this call will trigger a * warning and return. * * Note that calls to enable and disable polling must be strictly ordered, which * is automatically the case when they're only call from suspend/resume * callbacks. */ void drm_kms_helper_poll_disable(struct drm_device *dev) { if (drm_WARN_ON(dev, !dev->mode_config.poll_enabled)) return; if (dev->mode_config.poll_running) drm_kms_helper_disable_hpd(dev); cancel_delayed_work_sync(&dev->mode_config.output_poll_work); dev->mode_config.poll_running = false; } EXPORT_SYMBOL(drm_kms_helper_poll_disable); /** * drm_kms_helper_poll_init - initialize and enable output polling * @dev: drm_device * * This function initializes and then also enables output polling support for * @dev. Drivers which do not have reliable hotplug support in hardware can use * this helper infrastructure to regularly poll such connectors for changes in * their connection state. * * Drivers can control which connectors are polled by setting the * DRM_CONNECTOR_POLL_CONNECT and DRM_CONNECTOR_POLL_DISCONNECT flags. On * connectors where probing live outputs can result in visual distortion drivers * should not set the DRM_CONNECTOR_POLL_DISCONNECT flag to avoid this. * Connectors which have no flag or only DRM_CONNECTOR_POLL_HPD set are * completely ignored by the polling logic. * * Note that a connector can be both polled and probed from the hotplug handler, * in case the hotplug interrupt is known to be unreliable. */ void drm_kms_helper_poll_init(struct drm_device *dev) { INIT_DELAYED_WORK(&dev->mode_config.output_poll_work, output_poll_execute); dev->mode_config.poll_enabled = true; drm_kms_helper_poll_enable(dev); } EXPORT_SYMBOL(drm_kms_helper_poll_init); /** * drm_kms_helper_poll_fini - disable output polling and clean it up * @dev: drm_device */ void drm_kms_helper_poll_fini(struct drm_device *dev) { if (!dev->mode_config.poll_enabled) return; drm_kms_helper_poll_disable(dev); dev->mode_config.poll_enabled = false; } EXPORT_SYMBOL(drm_kms_helper_poll_fini); static void drm_kms_helper_poll_init_release(struct drm_device *dev, void *res) { drm_kms_helper_poll_fini(dev); } /** * drmm_kms_helper_poll_init - initialize and enable output polling * @dev: drm_device * * This function initializes and then also enables output polling support for * @dev similar to drm_kms_helper_poll_init(). Polling will automatically be * cleaned up when the DRM device goes away. * * See drm_kms_helper_poll_init() for more information. */ void drmm_kms_helper_poll_init(struct drm_device *dev) { int ret; drm_kms_helper_poll_init(dev); ret = drmm_add_action_or_reset(dev, drm_kms_helper_poll_init_release, dev); if (ret) drm_warn(dev, "Connector status will not be updated, error %d\n", ret); } EXPORT_SYMBOL(drmm_kms_helper_poll_init); static bool check_connector_changed(struct drm_connector *connector) { struct drm_device *dev = connector->dev; enum drm_connector_status old_status; u64 old_epoch_counter; /* Only handle HPD capable connectors. */ drm_WARN_ON(dev, !(connector->polled & DRM_CONNECTOR_POLL_HPD)); drm_WARN_ON(dev, !mutex_is_locked(&dev->mode_config.mutex)); old_status = connector->status; old_epoch_counter = connector->epoch_counter; connector->status = drm_helper_probe_detect(connector, NULL, false); if (old_epoch_counter == connector->epoch_counter) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Same epoch counter %llu\n", connector->base.id, connector->name, connector->epoch_counter); return false; } drm_dbg_kms(dev, "[CONNECTOR:%d:%s] status updated from %s to %s\n", connector->base.id, connector->name, drm_get_connector_status_name(old_status), drm_get_connector_status_name(connector->status)); drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Changed epoch counter %llu => %llu\n", connector->base.id, connector->name, old_epoch_counter, connector->epoch_counter); return true; } /** * drm_connector_helper_hpd_irq_event - hotplug processing * @connector: drm_connector * * Drivers can use this helper function to run a detect cycle on a connector * which has the DRM_CONNECTOR_POLL_HPD flag set in its &polled member. * * This helper function is useful for drivers which can track hotplug * interrupts for a single connector. Drivers that want to send a * hotplug event for all connectors or can't track hotplug interrupts * per connector need to use drm_helper_hpd_irq_event(). * * This function must be called from process context with no mode * setting locks held. * * Note that a connector can be both polled and probed from the hotplug * handler, in case the hotplug interrupt is known to be unreliable. * * Returns: * A boolean indicating whether the connector status changed or not */ bool drm_connector_helper_hpd_irq_event(struct drm_connector *connector) { struct drm_device *dev = connector->dev; bool changed; mutex_lock(&dev->mode_config.mutex); changed = check_connector_changed(connector); mutex_unlock(&dev->mode_config.mutex); if (changed) { drm_kms_helper_connector_hotplug_event(connector); drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Sent hotplug event\n", connector->base.id, connector->name); } return changed; } EXPORT_SYMBOL(drm_connector_helper_hpd_irq_event); /** * drm_helper_hpd_irq_event - hotplug processing * @dev: drm_device * * Drivers can use this helper function to run a detect cycle on all connectors * which have the DRM_CONNECTOR_POLL_HPD flag set in their &polled member. All * other connectors are ignored, which is useful to avoid reprobing fixed * panels. * * This helper function is useful for drivers which can't or don't track hotplug * interrupts for each connector. * * Drivers which support hotplug interrupts for each connector individually and * which have a more fine-grained detect logic can use * drm_connector_helper_hpd_irq_event(). Alternatively, they should bypass this * code and directly call drm_kms_helper_hotplug_event() in case the connector * state changed. * * This function must be called from process context with no mode * setting locks held. * * Note that a connector can be both polled and probed from the hotplug handler, * in case the hotplug interrupt is known to be unreliable. * * Returns: * A boolean indicating whether the connector status changed or not */ bool drm_helper_hpd_irq_event(struct drm_device *dev) { struct drm_connector *connector, *first_changed_connector = NULL; struct drm_connector_list_iter conn_iter; int changed = 0; if (!dev->mode_config.poll_enabled) return false; mutex_lock(&dev->mode_config.mutex); drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { /* Only handle HPD capable connectors. */ if (!(connector->polled & DRM_CONNECTOR_POLL_HPD)) continue; if (check_connector_changed(connector)) { if (!first_changed_connector) { drm_connector_get(connector); first_changed_connector = connector; } changed++; } } drm_connector_list_iter_end(&conn_iter); mutex_unlock(&dev->mode_config.mutex); if (changed == 1) drm_kms_helper_connector_hotplug_event(first_changed_connector); else if (changed > 0) drm_kms_helper_hotplug_event(dev); if (first_changed_connector) drm_connector_put(first_changed_connector); return changed; } EXPORT_SYMBOL(drm_helper_hpd_irq_event); /** * drm_crtc_helper_mode_valid_fixed - Validates a display mode * @crtc: the crtc * @mode: the mode to validate * @fixed_mode: the display hardware's mode * * Returns: * MODE_OK on success, or another mode-status code otherwise. */ enum drm_mode_status drm_crtc_helper_mode_valid_fixed(struct drm_crtc *crtc, const struct drm_display_mode *mode, const struct drm_display_mode *fixed_mode) { if (mode->hdisplay != fixed_mode->hdisplay && mode->vdisplay != fixed_mode->vdisplay) return MODE_ONE_SIZE; else if (mode->hdisplay != fixed_mode->hdisplay) return MODE_ONE_WIDTH; else if (mode->vdisplay != fixed_mode->vdisplay) return MODE_ONE_HEIGHT; return MODE_OK; } EXPORT_SYMBOL(drm_crtc_helper_mode_valid_fixed); /** * drm_connector_helper_get_modes_fixed - Duplicates a display mode for a connector * @connector: the connector * @fixed_mode: the display hardware's mode * * This function duplicates a display modes for a connector. Drivers for hardware * that only supports a single fixed mode can use this function in their connector's * get_modes helper. * * Returns: * The number of created modes. */ int drm_connector_helper_get_modes_fixed(struct drm_connector *connector, const struct drm_display_mode *fixed_mode) { struct drm_device *dev = connector->dev; struct drm_display_mode *mode; mode = drm_mode_duplicate(dev, fixed_mode); if (!mode) { drm_err(dev, "Failed to duplicate mode " DRM_MODE_FMT "\n", DRM_MODE_ARG(fixed_mode)); return 0; } if (mode->name[0] == '\0') drm_mode_set_name(mode); mode->type |= DRM_MODE_TYPE_PREFERRED; drm_mode_probed_add(connector, mode); if (mode->width_mm) connector->display_info.width_mm = mode->width_mm; if (mode->height_mm) connector->display_info.height_mm = mode->height_mm; return 1; } EXPORT_SYMBOL(drm_connector_helper_get_modes_fixed); /** * drm_connector_helper_get_modes - Read EDID and update connector. * @connector: The connector * * Read the EDID using drm_edid_read() (which requires that connector->ddc is * set), and update the connector using the EDID. * * This can be used as the "default" connector helper .get_modes() hook if the * driver does not need any special processing. This is sets the example what * custom .get_modes() hooks should do regarding EDID read and connector update. * * Returns: Number of modes. */ int drm_connector_helper_get_modes(struct drm_connector *connector) { const struct drm_edid *drm_edid; int count; drm_edid = drm_edid_read(connector); /* * Unconditionally update the connector. If the EDID was read * successfully, fill in the connector information derived from the * EDID. Otherwise, if the EDID is NULL, clear the connector * information. */ drm_edid_connector_update(connector, drm_edid); count = drm_edid_connector_add_modes(connector); drm_edid_free(drm_edid); return count; } EXPORT_SYMBOL(drm_connector_helper_get_modes); /** * drm_connector_helper_tv_get_modes - Fills the modes availables to a TV connector * @connector: The connector * * Fills the available modes for a TV connector based on the supported * TV modes, and the default mode expressed by the kernel command line. * * This can be used as the default TV connector helper .get_modes() hook * if the driver does not need any special processing. * * Returns: * The number of modes added to the connector. */ int drm_connector_helper_tv_get_modes(struct drm_connector *connector) { struct drm_device *dev = connector->dev; struct drm_property *tv_mode_property = dev->mode_config.tv_mode_property; struct drm_cmdline_mode *cmdline = &connector->cmdline_mode; unsigned int ntsc_modes = BIT(DRM_MODE_TV_MODE_NTSC) | BIT(DRM_MODE_TV_MODE_NTSC_443) | BIT(DRM_MODE_TV_MODE_NTSC_J) | BIT(DRM_MODE_TV_MODE_PAL_M); unsigned int pal_modes = BIT(DRM_MODE_TV_MODE_PAL) | BIT(DRM_MODE_TV_MODE_PAL_N) | BIT(DRM_MODE_TV_MODE_SECAM); unsigned int tv_modes[2] = { UINT_MAX, UINT_MAX }; unsigned int i, supported_tv_modes = 0; if (!tv_mode_property) return 0; for (i = 0; i < tv_mode_property->num_values; i++) supported_tv_modes |= BIT(tv_mode_property->values[i]); if (((supported_tv_modes & ntsc_modes) && (supported_tv_modes & pal_modes)) || (supported_tv_modes & BIT(DRM_MODE_TV_MODE_MONOCHROME))) { uint64_t default_mode; if (drm_object_property_get_default_value(&connector->base, tv_mode_property, &default_mode)) return 0; if (cmdline->tv_mode_specified) default_mode = cmdline->tv_mode; if (BIT(default_mode) & ntsc_modes) { tv_modes[0] = DRM_MODE_TV_MODE_NTSC; tv_modes[1] = DRM_MODE_TV_MODE_PAL; } else { tv_modes[0] = DRM_MODE_TV_MODE_PAL; tv_modes[1] = DRM_MODE_TV_MODE_NTSC; } } else if (supported_tv_modes & ntsc_modes) { tv_modes[0] = DRM_MODE_TV_MODE_NTSC; } else if (supported_tv_modes & pal_modes) { tv_modes[0] = DRM_MODE_TV_MODE_PAL; } else { return 0; } for (i = 0; i < ARRAY_SIZE(tv_modes); i++) { struct drm_display_mode *mode; if (tv_modes[i] == DRM_MODE_TV_MODE_NTSC) mode = drm_mode_analog_ntsc_480i(dev); else if (tv_modes[i] == DRM_MODE_TV_MODE_PAL) mode = drm_mode_analog_pal_576i(dev); else break; if (!mode) return i; if (!i) mode->type |= DRM_MODE_TYPE_PREFERRED; drm_mode_probed_add(connector, mode); } return i; } EXPORT_SYMBOL(drm_connector_helper_tv_get_modes); /** * drm_connector_helper_detect_from_ddc - Read EDID and detect connector status. * @connector: The connector * @ctx: Acquire context * @force: Perform screen-destructive operations, if necessary * * Detects the connector status by reading the EDID using drm_probe_ddc(), * which requires connector->ddc to be set. Returns connector_status_connected * on success or connector_status_disconnected on failure. * * Returns: * The connector status as defined by enum drm_connector_status. */ int drm_connector_helper_detect_from_ddc(struct drm_connector *connector, struct drm_modeset_acquire_ctx *ctx, bool force) { struct i2c_adapter *ddc = connector->ddc; if (!ddc) return connector_status_unknown; if (drm_probe_ddc(ddc)) return connector_status_connected; return connector_status_disconnected; } EXPORT_SYMBOL(drm_connector_helper_detect_from_ddc);
11 1 1 1 1 1 1 1 1 5 4 5 1 1 1 1 1 1 1 1 1 1 8 4 4 4 5 5 5 5 1 4 4 4 4 4 1 5 44 44 44 43 44 44 44 41 44 49 2 2 2 2 1 2 2 1 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 9 2 2 2 2 2 34 34 34 34 6 4 1 6 4 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 7 4 3 7 4 1 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 33 33 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 // SPDX-License-Identifier: GPL-2.0-only /* * drivers/net/veth.c * * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc * * Author: Pavel Emelianov <xemul@openvz.org> * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> * */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/u64_stats_sync.h> #include <net/rtnetlink.h> #include <net/dst.h> #include <net/netdev_lock.h> #include <net/xfrm.h> #include <net/xdp.h> #include <linux/veth.h> #include <linux/module.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> #include <linux/bpf_trace.h> #include <linux/net_tstamp.h> #include <linux/skbuff_ref.h> #include <net/page_pool/helpers.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" #define VETH_XDP_FLAG BIT(0) #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 #define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; /* xdp */ u64 xdp_packets; u64 xdp_bytes; u64 xdp_redirect; u64 xdp_drops; u64 xdp_tx; u64 xdp_tx_err; u64 peer_tq_xdp_xmit; u64 peer_tq_xdp_xmit_err; }; struct veth_rq_stats { struct veth_stats vs; struct u64_stats_sync syncp; }; struct veth_rq { struct napi_struct xdp_napi; struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ struct net_device *dev; struct bpf_prog __rcu *xdp_prog; struct xdp_mem_info xdp_mem; struct veth_rq_stats stats; bool rx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; struct page_pool *page_pool; }; struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; struct bpf_prog *_xdp_prog; struct veth_rq *rq; unsigned int requested_headroom; }; struct veth_xdp_tx_bq { struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; unsigned int count; }; /* * ethtool interface */ struct veth_q_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; }; #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) static const struct veth_q_stat_desc veth_rq_stats_desc[] = { { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, { "drops", VETH_RQ_STAT(rx_drops) }, { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, }; #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) static const struct veth_q_stat_desc veth_tq_stats_desc[] = { { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, }; #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) static struct { const char string[ETH_GSTRING_LEN]; } ethtool_stats_keys[] = { { "peer_ifindex" }, }; struct veth_xdp_buff { struct xdp_buff xdp; struct sk_buff *skb; }; static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { u8 *p = buf; int i, j; switch(stringset) { case ETH_SS_STATS: memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys)); p += sizeof(ethtool_stats_keys); for (i = 0; i < dev->real_num_rx_queues; i++) for (j = 0; j < VETH_RQ_STATS_LEN; j++) ethtool_sprintf(&p, "rx_queue_%u_%.18s", i, veth_rq_stats_desc[j].desc); for (i = 0; i < dev->real_num_tx_queues; i++) for (j = 0; j < VETH_TQ_STATS_LEN; j++) ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, veth_tq_stats_desc[j].desc); page_pool_ethtool_stats_get_strings(p); break; } } static int veth_get_sset_count(struct net_device *dev, int sset) { switch (sset) { case ETH_SS_STATS: return ARRAY_SIZE(ethtool_stats_keys) + VETH_RQ_STATS_LEN * dev->real_num_rx_queues + VETH_TQ_STATS_LEN * dev->real_num_tx_queues + page_pool_ethtool_stats_get_count(); default: return -EOPNOTSUPP; } } static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) { #ifdef CONFIG_PAGE_POOL_STATS struct veth_priv *priv = netdev_priv(dev); struct page_pool_stats pp_stats = {}; int i; for (i = 0; i < dev->real_num_rx_queues; i++) { if (!priv->rq[i].page_pool) continue; page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); } page_pool_ethtool_stats_get(data, &pp_stats); #endif /* CONFIG_PAGE_POOL_STATS */ } static void veth_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int i, j, idx, pp_idx; data[0] = peer ? peer->ifindex : 0; idx = 1; for (i = 0; i < dev->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; const void *stats_base = (void *)&rq_stats->vs; unsigned int start; size_t offset; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_RQ_STATS_LEN; j++) { offset = veth_rq_stats_desc[j].offset; data[idx + j] = *(u64 *)(stats_base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); idx += VETH_RQ_STATS_LEN; } pp_idx = idx; if (!peer) goto page_pool_stats; rcv_priv = netdev_priv(peer); for (i = 0; i < peer->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; const void *base = (void *)&rq_stats->vs; unsigned int start, tx_idx = idx; size_t offset; tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_TQ_STATS_LEN; j++) { offset = veth_tq_stats_desc[j].offset; data[tx_idx + j] += *(u64 *)(base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); } pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; page_pool_stats: veth_get_page_pool_stats(dev, &data[pp_idx]); } static void veth_get_channels(struct net_device *dev, struct ethtool_channels *channels) { channels->tx_count = dev->real_num_tx_queues; channels->rx_count = dev->real_num_rx_queues; channels->max_tx = dev->num_tx_queues; channels->max_rx = dev->num_rx_queues; } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch); static const struct ethtool_ops veth_ethtool_ops = { .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = veth_get_strings, .get_sset_count = veth_get_sset_count, .get_ethtool_stats = veth_get_ethtool_stats, .get_link_ksettings = veth_get_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, .set_channels = veth_set_channels, }; /* general routines */ static bool veth_is_xdp_frame(void *ptr) { return (unsigned long)ptr & VETH_XDP_FLAG; } static struct xdp_frame *veth_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } static void *veth_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | VETH_XDP_FLAG); } static void veth_ptr_free(void *ptr) { if (veth_is_xdp_frame(ptr)) xdp_return_frame(veth_ptr_to_xdp(ptr)); else kfree_skb(ptr); } static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); if (!READ_ONCE(rq->rx_notify_masked) && napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) return NETDEV_TX_BUSY; /* signal qdisc layer */ return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, struct veth_rq *rq, bool xdp) { return __dev_forward_skb(dev, skb) ?: xdp ? veth_xdp_rx(rq, skb) : __netif_rx(skb); } /* return true if the specified skb has chances of GRO aggregation * Don't strive for accuracy, but try to avoid GRO overhead in the most * common scenarios. * When XDP is enabled, all traffic is considered eligible, as the xmit * device has TSO off. * When TSO is enabled on the xmit device, we are likely interested only * in UDP aggregation, explicitly check for that if the skb is suspected * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - * to belong to locally generated UDP traffic. */ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, const struct net_device *rcv, const struct sk_buff *skb) { return !(dev->features & NETIF_F_ALL_TSO) || (skb->destructor == sock_wfree && rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; bool use_napi = false; int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { kfree_skb(skb); goto drop; } rcv_priv = netdev_priv(rcv); rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; /* The napi pointer is available when an XDP program is * attached or when GRO is enabled * Don't bother with napi/GRO if the skb can't be aggregated */ use_napi = rcu_access_pointer(rq->napi) && veth_skb_is_eligible_for_gro(dev, rcv, skb); } skb_tx_timestamp(skb); ret = veth_forward_skb(rcv, skb, rq, use_napi); switch (ret) { case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ if (!use_napi) dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); break; case NETDEV_TX_BUSY: /* If a qdisc is attached to our virtual device, returning * NETDEV_TX_BUSY is allowed. */ txq = netdev_get_tx_queue(dev, rxq); if (qdisc_txq_has_no_queue(txq)) { dev_kfree_skb_any(skb); goto drop; } /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ __skb_push(skb, ETH_HLEN); netif_tx_stop_queue(txq); /* Makes sure NAPI peer consumer runs. Consumer is responsible * for starting txq again, until then ndo_start_xmit (this * function) will not be invoked by the netstack again. */ __veth_xdp_flush(rq); break; case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); ret = NET_XMIT_DROP; break; default: net_crit_ratelimited("%s(%s): Invalid return code(%d)", __func__, dev->name, ret); } rcu_read_unlock(); return ret; } static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; result->peer_tq_xdp_xmit_err = 0; result->xdp_packets = 0; result->xdp_tx_err = 0; result->xdp_bytes = 0; result->rx_drops = 0; for (i = 0; i < dev->num_rx_queues; i++) { u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; struct veth_rq_stats *stats = &priv->rq[i].stats; unsigned int start; do { start = u64_stats_fetch_begin(&stats->syncp); peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; xdp_tx_err = stats->vs.xdp_tx_err; packets = stats->vs.xdp_packets; bytes = stats->vs.xdp_bytes; drops = stats->vs.rx_drops; } while (u64_stats_fetch_retry(&stats->syncp, start)); result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; result->xdp_bytes += bytes; result->rx_drops += drops; } } static void veth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; tot->tx_dropped = atomic64_read(&priv->dropped); dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; tot->rx_bytes += rx.xdp_bytes; tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { struct rtnl_link_stats64 tot_peer = {}; dev_fetch_sw_netstats(&tot_peer, peer->tstats); tot->rx_bytes += tot_peer.tx_bytes; tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; tot->rx_dropped += rx.xdp_tx_err; tot->tx_bytes += rx.xdp_bytes; tot->tx_packets += rx.xdp_packets; } rcu_read_unlock(); } /* fake multicast ability */ static void veth_set_multicast_list(struct net_device *dev) { } static int veth_select_rxq(struct net_device *dev) { return smp_processor_id() % dev->real_num_rx_queues; } static struct net_device *veth_peer_dev(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); /* Callers must be under RCU read side. */ return rcu_dereference(priv->peer); } static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); int i, ret = -ENXIO, nxmit = 0; struct net_device *rcv; unsigned int max_len; struct veth_rq *rq; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* The napi pointer is set if NAPI is enabled, which ensures that * xdp_ring is initialized on receive side and the peer device is up. */ if (!rcu_access_pointer(rq->napi)) goto out; max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; spin_lock(&rq->xdp_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *frame = frames[i]; void *ptr = veth_xdp_to_ptr(frame); if (unlikely(xdp_get_frame_len(frame) > max_len || __ptr_ring_produce(&rq->xdp_ring, ptr))) break; nxmit++; } spin_unlock(&rq->xdp_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __veth_xdp_flush(rq); ret = nxmit; if (ndo_xmit) { u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.peer_tq_xdp_xmit += nxmit; rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; u64_stats_update_end(&rq->stats.syncp); } out: rcu_read_unlock(); return ret; } static int veth_ndo_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { int err; err = veth_xdp_xmit(dev, n, frames, flags, true); if (err < 0) { struct veth_priv *priv = netdev_priv(dev); atomic64_add(n, &priv->dropped); } return err; } static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { int sent, i, err = 0, drops; sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); if (sent < 0) { err = sent; sent = 0; } for (i = sent; unlikely(i < bq->count); i++) xdp_return_frame(bq->q[i]); drops = bq->count - sent; trace_xdp_bulk_tx(rq->dev, sent, drops, err); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_tx += sent; rq->stats.vs.xdp_tx_err += drops; u64_stats_update_end(&rq->stats.syncp); bq->count = 0; } static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); struct net_device *rcv; struct veth_rq *rcv_rq; rcu_read_lock(); veth_xdp_flush_bq(rq, bq); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* xdp_ring is initialized on receive side? */ if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) goto out; __veth_xdp_flush(rcv_rq); out: rcu_read_unlock(); } static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, struct veth_xdp_tx_bq *bq) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) veth_xdp_flush_bq(rq, bq); bq->q[bq->count++] = frame; return 0; } static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, struct xdp_frame *frame, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act; xdp_convert_frame_to_buff(frame, xdp); xdp->rxq = &rq->xdp_rxq; vxbuf.skb = NULL; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: if (xdp_update_frame_from_buff(xdp, frame)) goto err_xdp; break; case XDP_TX: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto err_xdp; } } rcu_read_unlock(); return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); xdp_xmit: return NULL; } /* frames array contains VETH_XDP_BATCH at most */ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, int n_xdpf, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *skbs[VETH_XDP_BATCH]; int i; if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { for (i = 0; i < n_xdpf; i++) xdp_return_frame(frames[i]); stats->rx_drops += n_xdpf; return; } for (i = 0; i < n_xdpf; i++) { struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(frames[i], skb, rq->dev); if (!skb) { xdp_return_frame(frames[i]); stats->rx_drops++; continue; } napi_gro_receive(&rq->xdp_napi, skb); } } static void veth_xdp_get(struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i; get_page(virt_to_page(xdp->data)); if (likely(!xdp_buff_has_frags(xdp))) return; for (i = 0; i < sinfo->nr_frags; i++) __skb_frag_ref(&sinfo->frags[i]); } static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, struct xdp_buff *xdp, struct sk_buff **pskb) { struct sk_buff *skb = *pskb; u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || skb_shinfo(skb)->nr_frags || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) goto drop; skb = *pskb; } /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = skb_end_pointer(skb) - skb->head; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), skb_headlen(skb), true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } *pskb = skb; return 0; drop: consume_skb(skb); *pskb = NULL; return -ENOMEM; } static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act, metalen; int off; skb_prepare_for_gro(skb); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (unlikely(!xdp_prog)) { rcu_read_unlock(); goto out; } __skb_push(skb, skb->data - skb_mac_header(skb)); if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) goto drop; vxbuf.skb = skb; orig_data = xdp->data; orig_data_end = xdp->data_end; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto xdp_drop; } rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) __skb_pull(skb, -off); skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: return skb; drop: stats->rx_drops++; xdp_drop: rcu_read_unlock(); kfree_skb(skb); return NULL; err_xdp: rcu_read_unlock(); xdp_return_buff(xdp); xdp_xmit: return NULL; } static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); if (!ptr) break; if (veth_is_xdp_frame(ptr)) { /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); stats->xdp_bytes += xdp_get_frame_len(frame); frame = veth_xdp_rcv_one(rq, frame, bq, stats); if (frame) { /* XDP_PASS */ xdpf[n_xdpf++] = frame; if (n_xdpf == VETH_XDP_BATCH) { veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); n_xdpf = 0; } } } else { /* ndo_start_xmit */ struct sk_buff *skb = ptr; stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); if (skb) { if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) netif_receive_skb(skb); else napi_gro_receive(&rq->xdp_napi, skb); } } done++; } if (n_xdpf) veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; rq->stats.vs.xdp_drops += stats->xdp_drops; rq->stats.vs.rx_drops += stats->rx_drops; rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); return done; } static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); struct veth_priv *priv = netdev_priv(rq->dev); int queue_idx = rq->xdp_rxq.queue_index; struct netdev_queue *peer_txq; struct veth_stats stats = {}; struct net_device *peer_dev; struct veth_xdp_tx_bq bq; int done; bq.count = 0; /* NAPI functions as RCU section */ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) xdp_do_flush(); if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); xdp_clear_return_frame_no_direct(); if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { if (napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } } /* Release backpressure per NAPI poll */ smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ if (peer_txq && netif_tx_queue_stopped(peer_txq)) { txq_trans_cond_update(peer_txq); netif_tx_wake_queue(peer_txq); } return done; } static int veth_create_page_pool(struct veth_rq *rq) { struct page_pool_params pp_params = { .order = 0, .pool_size = VETH_RING_SIZE, .nid = NUMA_NO_NODE, .dev = &rq->dev->dev, }; rq->page_pool = page_pool_create(&pp_params); if (IS_ERR(rq->page_pool)) { int err = PTR_ERR(rq->page_pool); rq->page_pool = NULL; return err; } return 0; } static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { err = veth_create_page_pool(&priv->rq[i]); if (err) goto err_page_pool; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); if (err) goto err_xdp_ring; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; napi_enable(&rq->xdp_napi); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; err_xdp_ring: for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); i = end; err_page_pool: for (i--; i >= start; i--) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } return err; } static int __veth_napi_enable(struct net_device *dev) { return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_napi_del_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rcu_assign_pointer(priv->rq[i].napi, NULL); napi_disable(&rq->xdp_napi); __netif_napi_del(&rq->xdp_napi); } synchronize_net(); for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->rx_notify_masked = false; ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); } for (i = start; i < end; i++) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } } static void veth_napi_del(struct net_device *dev) { veth_napi_del_range(dev, 0, dev->real_num_rx_queues); } static bool veth_gro_requested(const struct net_device *dev) { return !!(dev->wanted_features & NETIF_F_GRO); } static int veth_enable_xdp_range(struct net_device *dev, int start, int end, bool napi_already_on) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; if (!napi_already_on) netif_napi_add(dev, &rq->xdp_napi, veth_poll); err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) goto err_reg_mem; /* Save original mem info as it can be overwritten */ rq->xdp_mem = rq->xdp_rxq.mem; } return 0; err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: for (i--; i >= start; i--) { struct veth_rq *rq = &priv->rq[i]; xdp_rxq_info_unreg(&rq->xdp_rxq); if (!napi_already_on) netif_napi_del(&rq->xdp_napi); } return err; } static void veth_disable_xdp_range(struct net_device *dev, int start, int end, bool delete_napi) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->xdp_rxq.mem = rq->xdp_mem; xdp_rxq_info_unreg(&rq->xdp_rxq); if (delete_napi) netif_napi_del(&rq->xdp_napi); } } static int veth_enable_xdp(struct net_device *dev) { bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); struct veth_priv *priv = netdev_priv(dev); int err, i; if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); if (err) return err; if (!napi_already_on) { err = __veth_napi_enable(dev); if (err) { veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); return err; } } } for (i = 0; i < dev->real_num_rx_queues; i++) { rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; } static void veth_disable_xdp(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); if (!netif_running(dev) || !veth_gro_requested(dev)) veth_napi_del(dev); veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } static int veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_add(dev, &rq->xdp_napi, veth_poll); } err = __veth_napi_enable_range(dev, start, end); if (err) { for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_del(&rq->xdp_napi); } return err; } return err; } static int veth_napi_enable(struct net_device *dev) { return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_disable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); if (start >= end) return; if (priv->_xdp_prog) { veth_napi_del_range(dev, start, end); veth_disable_xdp_range(dev, start, end, false); } else if (veth_gro_requested(dev)) { veth_napi_del_range(dev, start, end); } } static int veth_enable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err; if (start >= end) return 0; if (priv->_xdp_prog) { /* these channels are freshly initialized, napi is not on there even * when GRO is requeste */ err = veth_enable_xdp_range(dev, start, end, false); if (err) return err; err = __veth_napi_enable_range(dev, start, end); if (err) { /* on error always delete the newly added napis */ veth_disable_xdp_range(dev, start, end, true); return err; } } else if (veth_gro_requested(dev)) { return veth_napi_enable_range(dev, start, end); } return 0; } static void veth_set_xdp_features(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); } else { xdp_clear_features_flag(dev); } } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch) { struct veth_priv *priv = netdev_priv(dev); unsigned int old_rx_count, new_rx_count; struct veth_priv *peer_priv; struct net_device *peer; int err; /* sanity check. Upper bounds are already enforced by the caller */ if (!ch->rx_count || !ch->tx_count) return -EINVAL; /* avoid braking XDP, if that is enabled */ peer = rtnl_dereference(priv->peer); peer_priv = peer ? netdev_priv(peer) : NULL; if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) return -EINVAL; if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) return -EINVAL; old_rx_count = dev->real_num_rx_queues; new_rx_count = ch->rx_count; if (netif_running(dev)) { /* turn device off */ netif_carrier_off(dev); if (peer) netif_carrier_off(peer); /* try to allocate new resources, as needed*/ err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); if (err) goto out; } err = netif_set_real_num_rx_queues(dev, ch->rx_count); if (err) goto revert; err = netif_set_real_num_tx_queues(dev, ch->tx_count); if (err) { int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); /* this error condition could happen only if rx and tx change * in opposite directions (e.g. tx nr raises, rx nr decreases) * and we can't do anything to fully restore the original * status */ if (err2) pr_warn("Can't restore rx queues config %d -> %d %d", new_rx_count, old_rx_count, err2); else goto revert; } out: if (netif_running(dev)) { /* note that we need to swap the arguments WRT the enable part * to identify the range we have to disable */ veth_disable_range_safe(dev, new_rx_count, old_rx_count); netif_carrier_on(dev); if (peer) netif_carrier_on(peer); } /* update XDP supported features */ veth_set_xdp_features(dev); if (peer) veth_set_xdp_features(peer); return err; revert: new_rx_count = old_rx_count; old_rx_count = ch->rx_count; goto out; } static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int err; if (!peer) return -ENOTCONN; if (priv->_xdp_prog) { err = veth_enable_xdp(dev); if (err) return err; } else if (veth_gro_requested(dev)) { err = veth_napi_enable(dev); if (err) return err; } if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } veth_set_xdp_features(dev); return 0; } static int veth_close(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); if (priv->_xdp_prog) veth_disable_xdp(dev); else if (veth_gro_requested(dev)) veth_napi_del(dev); return 0; } static int is_valid_veth_mtu(int mtu) { return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; } static int veth_alloc_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; for (i = 0; i < dev->num_rx_queues; i++) { priv->rq[i].dev = dev; u64_stats_init(&priv->rq[i].stats.syncp); } return 0; } static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static void veth_poll_controller(struct net_device *dev) { /* veth only receives frames when its peer sends one * Since it has nothing to do with disabling irqs, we are guaranteed * never to have pending data when we poll for it so * there is nothing to do here. * * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole */ } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int veth_get_iflink(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; } static netdev_features_t veth_fix_features(struct net_device *dev, netdev_features_t features) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer) { struct veth_priv *peer_priv = netdev_priv(peer); if (peer_priv->_xdp_prog) features &= ~NETIF_F_GSO_SOFTWARE; } return features; } static int veth_set_features(struct net_device *dev, netdev_features_t features) { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; if (peer) xdp_features_set_redirect_target(peer, true); } else { if (peer) xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; } static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); struct net_device *peer; if (new_hr < 0) new_hr = 0; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) goto out; peer_priv = netdev_priv(peer); priv->requested_headroom = new_hr; new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); dev->needed_headroom = new_hr; peer->needed_headroom = new_hr; out: rcu_read_unlock(); } static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct veth_priv *priv = netdev_priv(dev); struct bpf_prog *old_prog; struct net_device *peer; unsigned int max_mtu; int err; old_prog = priv->_xdp_prog; priv->_xdp_prog = prog; peer = rtnl_dereference(priv->peer); if (prog) { if (!peer) { NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); err = -ENOTCONN; goto err; } max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - peer->hard_header_len; /* Allow increasing the max_mtu if the program supports * XDP fragments. */ if (prog->aux->xdp_has_frags) max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; if (peer->mtu > max_mtu) { NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); err = -ERANGE; goto err; } if (dev->real_num_rx_queues < peer->real_num_tx_queues) { NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); err = -ENOSPC; goto err; } if (dev->flags & IFF_UP) { err = veth_enable_xdp(dev); if (err) { NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); goto err; } } if (!old_prog) { peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { if (peer && !veth_gro_requested(dev)) xdp_features_clear_redirect_target(peer); if (dev->flags & IFF_UP) veth_disable_xdp(dev); if (peer) { peer->hw_features |= NETIF_F_GSO_SOFTWARE; peer->max_mtu = ETH_MAX_MTU; } } bpf_prog_put(old_prog); } if ((!!old_prog ^ !!prog) && peer) netdev_update_features(peer); return 0; err: priv->_xdp_prog = old_prog; return err; } static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { struct veth_xdp_buff *_ctx = (void *)ctx; if (!_ctx->skb) return -ENODATA; *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; return 0; } static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; struct sk_buff *skb = _ctx->skb; if (!skb) return -ENODATA; *hash = skb_get_hash(skb); *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; return 0; } static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { const struct veth_xdp_buff *_ctx = (void *)ctx; const struct sk_buff *skb = _ctx->skb; int err; if (!skb) return -ENODATA; err = __vlan_hwaccel_get_tag(skb, vlan_tci); if (err) return err; *vlan_proto = skb->vlan_proto; return err; } static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, .ndo_stop = veth_close, .ndo_start_xmit = veth_xmit, .ndo_get_stats64 = veth_get_stats64, .ndo_set_rx_mode = veth_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, .ndo_fix_features = veth_fix_features, .ndo_set_features = veth_set_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, .ndo_get_peer_dev = veth_peer_dev, }; static const struct xdp_metadata_ops veth_xdp_metadata_ops = { .xmo_rx_timestamp = veth_xdp_rx_timestamp, .xmo_rx_hash = veth_xdp_rx_hash, .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) static void veth_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* * netlink interface */ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (tb[IFLA_MTU]) { if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) return -EINVAL; } return 0; } static struct rtnl_link_ops veth_link_ops; static void veth_disable_gro(struct net_device *dev) { dev->features &= ~NETIF_F_GRO; dev->wanted_features &= ~NETIF_F_GRO; netdev_update_features(dev); } static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) { int err; if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { err = netif_set_real_num_tx_queues(dev, 1); if (err) return err; } if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { err = netif_set_real_num_rx_queues(dev, 1); if (err) return err; } return 0; } static int veth_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; /* * create and register peer first */ if (data && data[VETH_INFO_PEER]) { struct nlattr *nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; tbp = tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); if (IS_ERR(peer)) return PTR_ERR(peer); if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); if (err < 0) goto err_register_peer; /* keep GRO disabled by default to be consistent with the established * veth behavior */ veth_disable_gro(peer); netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; /* * register dev last * * note, that since we've registered new device the dev's name * should be re-allocated */ if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto err_register_dev; netif_carrier_off(dev); /* * tie the deviced together */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); err = veth_init_queues(dev, tb); if (err) goto err_queues; priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); err = veth_init_queues(peer, tb); if (err) goto err_queues; veth_disable_gro(dev); /* update XDP supported features */ veth_set_xdp_features(dev); veth_set_xdp_features(peer); return 0; err_queues: unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); return err; } static void veth_dellink(struct net_device *dev, struct list_head *head) { struct veth_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. */ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *veth_get_link_net(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static unsigned int veth_get_num_queues(void) { /* enforce the same queue limit as rtnl_create_link */ int queues = num_possible_cpus(); if (queues > 4096) queues = 4096; return queues; } static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, .priv_size = sizeof(struct veth_priv), .setup = veth_setup, .validate = veth_validate, .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, .get_num_tx_queues = veth_get_num_queues, .get_num_rx_queues = veth_get_num_queues, }; /* * init/fini */ static __init int veth_init(void) { return rtnl_link_register(&veth_link_ops); } static __exit void veth_exit(void) { rtnl_link_unregister(&veth_link_ops); } module_init(veth_init); module_exit(veth_exit); MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_RTNL_LINK(DRV_NAME);
5138 5098 5138 5142 5114 2 5 5 5 5 1 5 5 5 4 4 4 4 9 9 9 9 8 7 9 8 41 1 1 1 1 2 2 2 1 1 1 2 16 26 26 19 1 5 5 1 36 6 36 14 7 6 7 36 20 5 37 39 37 38 3 2 1 1 1 2 40 20 20 19 5 20 6 8 5 1 150 149 1 1 1 5 5 4 2 1 2 2 1 1 2 2 2 2 1 8 2 1 1 6 5 1 4 3 2 1 1 1 1 93 221 224 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 // SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/mnt_idmapping.h> #include <uapi/linux/lsm.h> #define CREATE_TRACE_POINTS #include <trace/events/capability.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable_helper - Determine whether a task has a particular effective * capability. * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cred_ns: The user namespace of the credentials * @cap: The capability to check for * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * See cap_capable for more details. */ static inline int cap_capable_helper(const struct cred *cred, struct user_namespace *target_ns, const struct user_namespace *cred_ns, int cap) { struct user_namespace *ns = target_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (likely(ns == cred_ns)) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @target_ns: The user namespace of the resource being accessed * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h (unused) * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_capable() has reverse semantics to the capable() call * and friends. That is cap_capable() returns an int 0 when a task has * a capability, while the kernel's capable(), has_ns_capability(), * has_ns_capability_noaudit(), and has_capability_noaudit() return a * bool true (1) for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *target_ns, int cap, unsigned int opts) { const struct user_namespace *cred_ns = cred->user_ns; int ret = cap_capable_helper(cred, target_ns, cred_ns, cap); trace_cap_capable(cred, target_ns, cred_ns, cap, ret); return ret; } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Return: 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: 0 if successful, -ve on error. */ int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { int error; error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } /** * kuid_root_in_ns - check whether the given kuid is root in the given ns * @kuid: the kuid to be tested * @ns: the user namespace to test against * * Returns true if @kuid represents the root user in @ns, false otherwise. */ static bool kuid_root_in_ns(kuid_t kuid, struct user_namespace *ns) { for (;; ns = ns->parent) { if (from_kuid(ns, kuid) == 0) return true; if (ns == &init_user_ns) break; } return false; } static bool vfsuid_root_in_currentns(vfsuid_t vfsuid) { kuid_t kuid; if (!vfsuid_valid(vfsuid)) return false; kuid = vfsuid_into_kuid(vfsuid); return kuid_root_in_ns(kuid, current_user_ns()); } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(int size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { int size; kuid_t kroot; vfsuid_t vfsroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf, sizeof(struct vfs_ns_cap_data), GFP_NOFS); dput(dentry); /* gcc11 complains if we don't check for !tmpbuf */ if (size < 0 || !tmpbuf) goto out_free; fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header(size, cap)) { root = 0; } else if (is_v3header(size, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid. */ vfsroot = make_vfsuid(idmap, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot)); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!vfsuid_root_in_currentns(vfsroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } /** * rootid_from_xattr - translate root uid of vfs caps * * @value: vfs caps value which may be modified by this function * @size: size of @ivalue * @task_ns: user namespace of the caller */ static vfsuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return VFSUIDT_INIT(make_kuid(task_ns, rootid)); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /** * cap_convert_nscap - check vfs caps * * @idmap: idmap of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue * * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Return: On success, return the new size; on error, return < 0. */ int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; vfsuid_t vfsrootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; vfsrootid = rootid_from_xattr(*ivalue, size, task_ns); if (!vfsuid_valid(vfsrootid)) return -EINVAL; rootid = from_vfsuid(idmap, fs_ns, vfsrootid); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.val = (new->cap_bset.val & caps->permitted.val) | (new->cap_inheritable.val & caps->inheritable.val); if (caps->permitted.val & ~new->cap_permitted.val) /* insufficient to execute correctly */ ret = -EPERM; /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * * @idmap: idmap of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; vfsuid_t rootvfsuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); if (!vfsuid_valid(rootvfsuid)) return -ENODATA; /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!vfsuid_root_in_currentns(rootvfsuid)) return -ENODATA; cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted); cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable); /* * Rev1 had just a single 32-bit word, later expanded * to a second one for the high bits */ if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32; cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32; } cpu_caps->permitted.val &= CAP_VALID_MASK; cpu_caps->inheritable.val &= CAP_VALID_MASK; cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid); return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, const struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file_mnt_idmap(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (uid_eq(new->euid, old->euid) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. * * Return: 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, id_changed; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ id_changed = !uid_eq(new->euid, old->euid) || !in_group_p(new->egid); if ((id_changed || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || id_changed) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (id_changed || !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid) || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * * @idmap: idmap of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! * Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. */ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied. * * Return: 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilities to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Determine if scheduler policy change is permitted * @p: The task to affect * * Determine if the requested scheduler policy change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_setioprio - Determine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Determine if the requested I/O priority change is permitted for the specified * task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_setnice - Determine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Determine if the requested task priority change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. */ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2: The argument data for this function * @arg3: The argument data for this function * @arg4: The argument data for this function * @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Return: 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. */ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits */ ) /* cannot change a locked bit */ return -EPERM; /* * Doing anything requires privilege (go read about the * "sendmail capabilities bug"), except for unprivileged bits. * Indeed, the SECURE_ALL_UNPRIVILEGED bits are not * restrictions enforced by the kernel but by user space on * itself. */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) { const unsigned long unpriv_and_locks = SECURE_ALL_UNPRIVILEGED | SECURE_ALL_UNPRIVILEGED << 1; const unsigned long changed = old->securebits ^ arg2; /* For legacy reason, denies non-change. */ if (!changed) return -EPERM; /* Denies privileged changes. */ if (changed & ~unpriv_and_locks) return -EPERM; } new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted. * * Return: 0 if permission granted, negative error code if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { return cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT); } /** * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. * * Return: 0 if this mapping should be allowed or -EPERM if not. */ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } #ifdef CONFIG_SECURITY static const struct lsm_id capability_lsmid = { .name = "capability", .id = LSM_ID_CAPABILITY, }; static struct security_hook_list capability_hooks[] __ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), &capability_lsmid); return 0; } DEFINE_LSM(capability) = { .id = &capability_lsmid, .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */
83 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL #define IORING_OFF_ZCRX_SHIFT 16 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); #endif unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); void io_free_region(struct user_struct *user, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); static inline void *io_region_get_ptr(struct io_mapped_region *mr) { return mr->ptr; } static inline bool io_region_is_set(struct io_mapped_region *mr) { return !!mr->nr_pages; } static inline void io_region_publish(struct io_ring_ctx *ctx, struct io_mapped_region *src_region, struct io_mapped_region *dst_region) { /* * Once published mmap can find it without holding only the ->mmap_lock * and not ->uring_lock. */ guard(mutex)(&ctx->mmap_lock); *dst_region = *src_region; } static inline size_t io_region_size(struct io_mapped_region *mr) { return (size_t) mr->nr_pages << PAGE_SHIFT; } #endif
5 1 5 4 3 2 2 1 5 5 218 215 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2011, Intel Corporation. * * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements * intended to allow network traffic with differing requirements * (highly reliable, no drops vs. best effort vs. low latency) to operate * and co-exist on Ethernet. Current DCB features are: * * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a * framework for assigning bandwidth guarantees to traffic classes. * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. * * More information about the emerging standards for these Ethernet features * can be found at: http://www.ieee802.org/1/pages/dcbridges.html * * This file implements an rtnetlink interface to allow configuration of DCB * features for capable devices. */ /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, [DCB_ATTR_STATE] = {.type = NLA_U8}, [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, [DCB_ATTR_CAP] = {.type = NLA_NESTED}, [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, [DCB_ATTR_BCN] = {.type = NLA_NESTED}, [DCB_ATTR_APP] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, [DCB_ATTR_DCBX] = {.type = NLA_U8}, [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, }; /* DCB priority flow control to User Priority nested attributes */ static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB priority grouping nested attributes */ static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_array(app_count, sizeof(struct dcb_app), GFP_KERNEL); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? */ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. */ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. */ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. */ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_GETDCB, .doit = dcb_doit}, {.msgtype = RTM_SETDCB, .doit = dcb_doit}, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } device_initcall(dcbnl_init);
6 5 5 6 6 6 6 6 4 4 2 5 6 5 5 4 3 6 6 6 4 4 3 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 4 4 4 4 3 4 2 2 2 2 4 4 2 2 2 2 4 2 2 2 4 11 10 10 10 9 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 // SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/btf.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> #include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; bool kexec_file_dbg_print; /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > nr_pages / 2) return -EINVAL; #ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. */ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } #endif /* * The destination addresses are searched from system RAM rather than * being allocated from the buddy allocator, so they are not guaranteed * to be accepted by the current kernel. Accept the destination * addresses before kexec swaps their content with the segments' source * pages to avoid accessing memory before it is accepted. */ for (i = 0; i < nr_segments; i++) accept_memory(image->segment[i].mem, image->segment[i].memsz); return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) return NULL; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; image->elfcorehdr_updated = false; #endif return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((end >= mstart) && (start <= mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. */ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } #ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end + 1; break; } } /* Ensure that these pages are decrypted if SME is enabled. */ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } #endif struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; #endif } return pages; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { destination &= PAGE_MASK; return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { page &= PAGE_MASK; return kimage_add_entry(image, page | IND_SOURCE); } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } static void kimage_free_cma(struct kimage *image) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { struct page *cma = image->segment_cma[i]; u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; if (!cma) continue; arch_kexec_pre_free_pages(page_address(cma), nr_pages); dma_release_from_contiguous(NULL, cma, nr_pages); image->segment_cma[i] = NULL; } } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } #endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. */ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* Free CMA allocations */ kimage_free_cma(image); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. */ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE - 1)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. */ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_cma_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } if (result) { result = -EFAULT; goto out; } ptr += mchunk; mbytes -= mchunk; cond_resched(); } /* Clear any remainder */ memset(ptr, 0, mbytes); out: return result; } static int kimage_load_normal_segment(struct kimage *image, int idx) { struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; if (image->segment_cma[idx]) return kimage_load_cma_segment(image, idx); result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #endif int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, idx); break; #endif } return result; } void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) { unsigned long src_page_addr, dest_page_addr = 0; unsigned long eaddr = addr + size; kimage_entry_t *ptr, entry; struct page **src_pages; unsigned int npages; void *vaddr = NULL; int i; /* * Collect the source pages and map them in a contiguous VA range. */ npages = PFN_UP(eaddr) - PFN_DOWN(addr); src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL); if (!src_pages) { pr_err("Could not allocate ima pages array.\n"); return NULL; } i = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) { dest_page_addr = entry & PAGE_MASK; } else if (entry & IND_SOURCE) { if (dest_page_addr >= addr && dest_page_addr < eaddr) { src_page_addr = entry & PAGE_MASK; src_pages[i++] = virt_to_page(__va(src_page_addr)); if (i == npages) break; dest_page_addr += PAGE_SIZE; } } } /* Sanity check. */ WARN_ON(i < npages); vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); kfree(src_pages); if (!vaddr) pr_err("Could not map ima buffer.\n"); return vaddr; } void kimage_unmap_segment(void *segment_buffer) { vunmap(segment_buffer); } struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; int limit; }; static struct kexec_load_limit load_limit_reboot = { .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), .limit = -1, }; static struct kexec_load_limit load_limit_panic = { .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), .limit = -1, }; struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct kexec_load_limit *limit = table->data; int val; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, }; int ret; if (write) { ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (ret) return ret; if (val < 0) return -EINVAL; mutex_lock(&limit->mutex); if (limit->limit != -1 && val >= limit->limit) ret = -EINVAL; else limit->limit = val; mutex_unlock(&limit->mutex); return ret; } mutex_lock(&limit->mutex); val = limit->limit; mutex_unlock(&limit->mutex); return proc_dointvec(&tmp, write, buffer, lenp, ppos); } static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, { .procname = "kexec_load_limit_panic", .data = &load_limit_panic, .mode = 0644, .proc_handler = kexec_limit_handler, }, { .procname = "kexec_load_limit_reboot", .data = &load_limit_reboot, .mode = 0644, .proc_handler = kexec_limit_handler, }, }; static int __init kexec_core_sysctl_init(void) { register_sysctl_init("kernel", kexec_core_sysctls); return 0; } late_initcall(kexec_core_sysctl_init); #endif bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; /* * Only the superuser can use the kexec syscall and if it has not * been disabled. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return false; /* Check limit counter and decrease it.*/ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? &load_limit_panic : &load_limit_reboot; mutex_lock(&limit->mutex); if (!limit->limit) { mutex_unlock(&limit->mutex); return false; } if (limit->limit != -1) limit->limit--; mutex_unlock(&limit->mutex); return true; } /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } error = liveupdate_reboot(); if (error) goto Unlock; #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur * before creating an image and before jumping from the * restore kernel to the image one, so it uses the same * device callbacks as those two flows. */ pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows * mentioned above. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* * This flow is analogous to hibernation flows that occur after * creating an image and after the image kernel has got control * back, and in case the devices have been reset or otherwise * manipulated in the meantime, it uses the device callbacks * used by the latter. */ syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; } static ssize_t loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); #ifdef CONFIG_CRASH_RESERVE static ssize_t crash_cma_ranges_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t len = 0; int i; for (i = 0; i < crashk_cma_cnt; ++i) { len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", crashk_cma_ranges[i].start, crashk_cma_ranges[i].end); } return len; } static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); #endif static ssize_t crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? ret : count; } static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); #endif /* CONFIG_CRASH_HOTPLUG */ #endif /* CONFIG_CRASH_DUMP */ static struct attribute *kexec_attrs[] = { &loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &crash_loaded_attr.attr, &crash_size_attr.attr, #ifdef CONFIG_CRASH_RESERVE &crash_cma_ranges_attr.attr, #endif #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif NULL }; struct kexec_link_entry { const char *target; const char *name; }; static struct kexec_link_entry kexec_links[] = { { "loaded", "kexec_loaded" }, #ifdef CONFIG_CRASH_DUMP { "crash_loaded", "kexec_crash_loaded" }, { "crash_size", "kexec_crash_size" }, #ifdef CONFIG_CRASH_RESERVE {"crash_cma_ranges", "kexec_crash_cma_ranges"}, #endif #ifdef CONFIG_CRASH_HOTPLUG { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, #endif #endif }; static struct kobject *kexec_kobj; ATTRIBUTE_GROUPS(kexec); static int __init init_kexec_sysctl(void) { int error; int i; kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); if (!kexec_kobj) { pr_err("failed to create kexec kobject\n"); return -ENOMEM; } error = sysfs_create_groups(kexec_kobj, kexec_groups); if (error) goto kset_exit; for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, kexec_links[i].target, kexec_links[i].name); if (error) pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); } return 0; kset_exit: kobject_put(kexec_kobj); return error; } subsys_initcall(init_kexec_sysctl);
1 4 1 4 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->rangesize != 1) { pr_info_ratelimited("multiple ranges no longer supported\n"); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static int xt_nat_checkentry(const struct xt_tgchk_param *par) { return nf_ct_netns_get(par->net, par->family); } static void xt_nat_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; dst->max_addr.ip = src->max_ip; dst->min_proto = src->min; dst->max_proto = src->max; } static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); } static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "SNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_dnat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init xt_nat_init(void) { return xt_register_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } static void __exit xt_nat_exit(void) { xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } module_init(xt_nat_init); module_exit(xt_nat_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ipt_SNAT"); MODULE_ALIAS("ipt_DNAT"); MODULE_ALIAS("ip6t_SNAT"); MODULE_ALIAS("ip6t_DNAT"); MODULE_DESCRIPTION("SNAT and DNAT targets support");
2 2 1 1 1 1 3 3 3 1 2 2 2 2 2 2 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 // SPDX-License-Identifier: GPL-2.0+ /* * drivers/usb/class/usbtmc.c - USB Test & Measurement class driver * * Copyright (C) 2007 Stefan Kopp, Gechingen, Germany * Copyright (C) 2008 Novell, Inc. * Copyright (C) 2008 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (C) 2018 IVI Foundation, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/kref.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/usb.h> #include <linux/compat.h> #include <linux/usb/tmc.h> /* Increment API VERSION when changing tmc.h with new flags or ioctls * or when changing a significant behavior of the driver. */ #define USBTMC_API_VERSION (3) #define USBTMC_HEADER_SIZE 12 #define USBTMC_MINOR_BASE 176 /* Minimum USB timeout (in milliseconds) */ #define USBTMC_MIN_TIMEOUT 100 /* Default USB timeout (in milliseconds) */ #define USBTMC_TIMEOUT 5000 /* Max number of urbs used in write transfers */ #define MAX_URBS_IN_FLIGHT 16 /* I/O buffer size used in generic read/write functions */ #define USBTMC_BUFSIZE (4096) /* * Maximum number of read cycles to empty bulk in endpoint during CLEAR and * ABORT_BULK_IN requests. Ends the loop if (for whatever reason) a short * packet is never read. */ #define USBTMC_MAX_READS_TO_CLEAR_BULK_IN 100 static const struct usb_device_id usbtmc_devices[] = { { USB_INTERFACE_INFO(USB_CLASS_APP_SPEC, 3, 0), }, { USB_INTERFACE_INFO(USB_CLASS_APP_SPEC, 3, 1), }, { 0, } /* terminating entry */ }; MODULE_DEVICE_TABLE(usb, usbtmc_devices); /* * This structure is the capabilities for the device * See section 4.2.1.8 of the USBTMC specification, * and section 4.2.2 of the USBTMC usb488 subclass * specification for details. */ struct usbtmc_dev_capabilities { __u8 interface_capabilities; __u8 device_capabilities; __u8 usb488_interface_capabilities; __u8 usb488_device_capabilities; }; /* This structure holds private data for each USBTMC device. One copy is * allocated for each USBTMC device in the driver's probe function. */ struct usbtmc_device_data { const struct usb_device_id *id; struct usb_device *usb_dev; struct usb_interface *intf; struct list_head file_list; unsigned int bulk_in; unsigned int bulk_out; u8 bTag; u8 bTag_last_write; /* needed for abort */ u8 bTag_last_read; /* needed for abort */ /* packet size of IN bulk */ u16 wMaxPacketSize; /* data for interrupt in endpoint handling */ u8 bNotify1; u8 bNotify2; u16 ifnum; u8 iin_bTag; u8 *iin_buffer; atomic_t iin_data_valid; unsigned int iin_ep; int iin_ep_present; int iin_interval; struct urb *iin_urb; u16 iin_wMaxPacketSize; /* coalesced usb488_caps from usbtmc_dev_capabilities */ __u8 usb488_caps; bool zombie; /* fd of disconnected device */ struct usbtmc_dev_capabilities capabilities; struct kref kref; struct mutex io_mutex; /* only one i/o function running at a time */ wait_queue_head_t waitq; struct fasync_struct *fasync; spinlock_t dev_lock; /* lock for file_list */ }; #define to_usbtmc_data(d) container_of(d, struct usbtmc_device_data, kref) /* * This structure holds private data for each USBTMC file handle. */ struct usbtmc_file_data { struct usbtmc_device_data *data; struct list_head file_elem; u32 timeout; u8 srq_byte; atomic_t srq_asserted; atomic_t closing; u8 bmTransferAttributes; /* member of DEV_DEP_MSG_IN */ u8 eom_val; u8 term_char; bool term_char_enabled; bool auto_abort; spinlock_t err_lock; /* lock for errors */ struct usb_anchor submitted; /* data for generic_write */ struct semaphore limit_write_sem; u32 out_transfer_size; int out_status; /* data for generic_read */ u32 in_transfer_size; int in_status; int in_urbs_used; struct usb_anchor in_anchor; wait_queue_head_t wait_bulk_in; }; /* Forward declarations */ static struct usb_driver usbtmc_driver; static void usbtmc_draw_down(struct usbtmc_file_data *file_data); static void usbtmc_delete(struct kref *kref) { struct usbtmc_device_data *data = to_usbtmc_data(kref); usb_put_dev(data->usb_dev); kfree(data); } static int usbtmc_open(struct inode *inode, struct file *filp) { struct usb_interface *intf; struct usbtmc_device_data *data; struct usbtmc_file_data *file_data; intf = usb_find_interface(&usbtmc_driver, iminor(inode)); if (!intf) { pr_err("can not find device for minor %d", iminor(inode)); return -ENODEV; } file_data = kzalloc(sizeof(*file_data), GFP_KERNEL); if (!file_data) return -ENOMEM; spin_lock_init(&file_data->err_lock); sema_init(&file_data->limit_write_sem, MAX_URBS_IN_FLIGHT); init_usb_anchor(&file_data->submitted); init_usb_anchor(&file_data->in_anchor); init_waitqueue_head(&file_data->wait_bulk_in); data = usb_get_intfdata(intf); /* Protect reference to data from file structure until release */ kref_get(&data->kref); mutex_lock(&data->io_mutex); file_data->data = data; atomic_set(&file_data->closing, 0); file_data->timeout = USBTMC_TIMEOUT; file_data->term_char = '\n'; file_data->term_char_enabled = 0; file_data->auto_abort = 0; file_data->eom_val = 1; INIT_LIST_HEAD(&file_data->file_elem); spin_lock_irq(&data->dev_lock); list_add_tail(&file_data->file_elem, &data->file_list); spin_unlock_irq(&data->dev_lock); mutex_unlock(&data->io_mutex); /* Store pointer in file structure's private data field */ filp->private_data = file_data; return 0; } /* * usbtmc_flush - called before file handle is closed */ static int usbtmc_flush(struct file *file, fl_owner_t id) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; file_data = file->private_data; if (file_data == NULL) return -ENODEV; atomic_set(&file_data->closing, 1); data = file_data->data; /* wait for io to stop */ mutex_lock(&data->io_mutex); usbtmc_draw_down(file_data); spin_lock_irq(&file_data->err_lock); file_data->in_status = 0; file_data->in_transfer_size = 0; file_data->in_urbs_used = 0; file_data->out_status = 0; file_data->out_transfer_size = 0; spin_unlock_irq(&file_data->err_lock); wake_up_interruptible_all(&data->waitq); mutex_unlock(&data->io_mutex); return 0; } static int usbtmc_release(struct inode *inode, struct file *file) { struct usbtmc_file_data *file_data = file->private_data; /* prevent IO _AND_ usbtmc_interrupt */ mutex_lock(&file_data->data->io_mutex); spin_lock_irq(&file_data->data->dev_lock); list_del(&file_data->file_elem); spin_unlock_irq(&file_data->data->dev_lock); mutex_unlock(&file_data->data->io_mutex); kref_put(&file_data->data->kref, usbtmc_delete); file_data->data = NULL; kfree(file_data); return 0; } static int usbtmc_ioctl_abort_bulk_in_tag(struct usbtmc_device_data *data, u8 tag) { u8 *buffer; struct device *dev; int rv; int n; int actual; dev = &data->intf->dev; buffer = kmalloc(USBTMC_BUFSIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_ABORT_BULK_IN, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, tag, data->bulk_in, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_ABORT_BULK_IN returned %x with tag %02x\n", buffer[0], buffer[1]); if (buffer[0] == USBTMC_STATUS_FAILED) { /* No transfer in progress and the Bulk-OUT FIFO is empty. */ rv = 0; goto exit; } if (buffer[0] == USBTMC_STATUS_TRANSFER_NOT_IN_PROGRESS) { /* The device returns this status if either: * - There is a transfer in progress, but the specified bTag * does not match. * - There is no transfer in progress, but the Bulk-OUT FIFO * is not empty. */ rv = -ENOMSG; goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_ABORT_BULK_IN returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_abort_bulk_in_status: dev_dbg(dev, "Reading from bulk in EP\n"); /* Data must be present. So use low timeout 300 ms */ actual = 0; rv = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, USBTMC_BUFSIZE, &actual, 300); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); n++; if (rv < 0) { dev_err(dev, "usb_bulk_msg returned %d\n", rv); if (rv != -ETIMEDOUT) goto exit; } if (actual == USBTMC_BUFSIZE) goto usbtmc_abort_bulk_in_status; if (n >= USBTMC_MAX_READS_TO_CLEAR_BULK_IN) { dev_err(dev, "Couldn't clear device buffer within %d cycles\n", USBTMC_MAX_READS_TO_CLEAR_BULK_IN); rv = -EPERM; goto exit; } rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_ABORT_BULK_IN_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0, data->bulk_in, buffer, 0x08, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_ABORT_BULK_IN returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) { rv = 0; goto exit; } if (buffer[0] != USBTMC_STATUS_PENDING) { dev_err(dev, "CHECK_ABORT_BULK_IN returned %x\n", buffer[0]); rv = -EPERM; goto exit; } if ((buffer[1] & 1) > 0) { /* The device has 1 or more queued packets the Host can read */ goto usbtmc_abort_bulk_in_status; } /* The Host must send CHECK_ABORT_BULK_IN_STATUS at a later time. */ rv = -EAGAIN; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_abort_bulk_in(struct usbtmc_device_data *data) { return usbtmc_ioctl_abort_bulk_in_tag(data, data->bTag_last_read); } static int usbtmc_ioctl_abort_bulk_out_tag(struct usbtmc_device_data *data, u8 tag) { struct device *dev; u8 *buffer; int rv; int n; dev = &data->intf->dev; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_ABORT_BULK_OUT, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, tag, data->bulk_out, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_ABORT_BULK_OUT returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_ABORT_BULK_OUT returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_abort_bulk_out_check_status: /* do not stress device with subsequent requests */ msleep(50); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_ABORT_BULK_OUT_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0, data->bulk_out, buffer, 0x08, USB_CTRL_GET_TIMEOUT); n++; if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_ABORT_BULK_OUT returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) goto usbtmc_abort_bulk_out_clear_halt; if ((buffer[0] == USBTMC_STATUS_PENDING) && (n < USBTMC_MAX_READS_TO_CLEAR_BULK_IN)) goto usbtmc_abort_bulk_out_check_status; rv = -EPERM; goto exit; usbtmc_abort_bulk_out_clear_halt: rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_abort_bulk_out(struct usbtmc_device_data *data) { return usbtmc_ioctl_abort_bulk_out_tag(data, data->bTag_last_write); } static int usbtmc_get_stb(struct usbtmc_file_data *file_data, __u8 *stb) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; u8 *buffer; u8 tag; int rv; long wait_rv; unsigned long expire; dev_dbg(dev, "Enter ioctl_read_stb iin_ep_present: %d\n", data->iin_ep_present); buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; atomic_set(&data->iin_data_valid, 0); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC488_REQUEST_READ_STATUS_BYTE, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, data->iin_bTag, data->ifnum, buffer, 0x03, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "stb usb_control_msg returned %d\n", rv); goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "control status returned %x\n", buffer[0]); rv = -EIO; goto exit; } if (data->iin_ep_present) { expire = msecs_to_jiffies(file_data->timeout); wait_rv = wait_event_interruptible_timeout( data->waitq, atomic_read(&data->iin_data_valid) != 0, expire); if (wait_rv < 0) { dev_dbg(dev, "wait interrupted %ld\n", wait_rv); rv = wait_rv; goto exit; } if (wait_rv == 0) { dev_dbg(dev, "wait timed out\n"); rv = -ETIMEDOUT; goto exit; } tag = data->bNotify1 & 0x7f; if (tag != data->iin_bTag) { dev_err(dev, "expected bTag %x got %x\n", data->iin_bTag, tag); } *stb = data->bNotify2; } else { *stb = buffer[2]; } dev_dbg(dev, "stb:0x%02x received %d\n", (unsigned int)*stb, rv); rv = 0; exit: /* bump interrupt bTag */ data->iin_bTag += 1; if (data->iin_bTag > 127) /* 1 is for SRQ see USBTMC-USB488 subclass spec section 4.3.1 */ data->iin_bTag = 2; kfree(buffer); return rv; } static int usbtmc488_ioctl_read_stb(struct usbtmc_file_data *file_data, void __user *arg) { int srq_asserted = 0; __u8 stb; int rv; rv = usbtmc_get_stb(file_data, &stb); if (rv < 0) return rv; srq_asserted = atomic_xchg(&file_data->srq_asserted, srq_asserted); if (srq_asserted) stb |= 0x40; /* Set RQS bit */ rv = put_user(stb, (__u8 __user *)arg); return rv; } static int usbtmc_ioctl_get_srq_stb(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; int srq_asserted = 0; __u8 stb = 0; int rv; spin_lock_irq(&data->dev_lock); srq_asserted = atomic_xchg(&file_data->srq_asserted, srq_asserted); if (srq_asserted) { stb = file_data->srq_byte; spin_unlock_irq(&data->dev_lock); rv = put_user(stb, (__u8 __user *)arg); } else { spin_unlock_irq(&data->dev_lock); rv = -ENOMSG; } dev_dbg(dev, "stb:0x%02x with srq received %d\n", (unsigned int)stb, rv); return rv; } static int usbtmc488_ioctl_wait_srq(struct usbtmc_file_data *file_data, __u32 __user *arg) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; u32 timeout; unsigned long expire; long wait_rv; if (!data->iin_ep_present) { dev_dbg(dev, "no interrupt endpoint present\n"); return -EFAULT; } if (get_user(timeout, arg)) return -EFAULT; expire = msecs_to_jiffies(timeout); mutex_unlock(&data->io_mutex); wait_rv = wait_event_interruptible_timeout( data->waitq, atomic_read(&file_data->srq_asserted) != 0 || atomic_read(&file_data->closing), expire); mutex_lock(&data->io_mutex); /* Note! disconnect or close could be called in the meantime */ if (atomic_read(&file_data->closing) || data->zombie) return -ENODEV; if (wait_rv < 0) { dev_dbg(dev, "%s - wait interrupted %ld\n", __func__, wait_rv); return wait_rv; } if (wait_rv == 0) { dev_dbg(dev, "%s - wait timed out\n", __func__); return -ETIMEDOUT; } dev_dbg(dev, "%s - srq asserted\n", __func__); return 0; } static int usbtmc488_ioctl_simple(struct usbtmc_device_data *data, void __user *arg, unsigned int cmd) { struct device *dev = &data->intf->dev; __u8 val; u8 *buffer; u16 wValue; int rv; if (!(data->usb488_caps & USBTMC488_CAPABILITY_SIMPLE)) return -EINVAL; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; if (cmd == USBTMC488_REQUEST_REN_CONTROL) { rv = copy_from_user(&val, arg, sizeof(val)); if (rv) { rv = -EFAULT; goto exit; } wValue = val ? 1 : 0; } else { wValue = 0; } rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), cmd, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, wValue, data->ifnum, buffer, 0x01, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "simple usb_control_msg failed %d\n", rv); goto exit; } else if (rv != 1) { dev_warn(dev, "simple usb_control_msg returned %d\n", rv); rv = -EIO; goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "simple control status returned %x\n", buffer[0]); rv = -EIO; goto exit; } rv = 0; exit: kfree(buffer); return rv; } /* * Sends a TRIGGER Bulk-OUT command message * See the USBTMC-USB488 specification, Table 2. * * Also updates bTag_last_write. */ static int usbtmc488_ioctl_trigger(struct usbtmc_file_data *file_data) { struct usbtmc_device_data *data = file_data->data; int retval; u8 *buffer; int actual; buffer = kzalloc(USBTMC_HEADER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; buffer[0] = 128; buffer[1] = data->bTag; buffer[2] = ~data->bTag; retval = usb_bulk_msg(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), buffer, USBTMC_HEADER_SIZE, &actual, file_data->timeout); /* Store bTag (in case we need to abort) */ data->bTag_last_write = data->bTag; /* Increment bTag -- and increment again if zero */ data->bTag++; if (!data->bTag) data->bTag++; kfree(buffer); if (retval < 0) { dev_err(&data->intf->dev, "%s returned %d\n", __func__, retval); return retval; } return 0; } static struct urb *usbtmc_create_urb(void) { const size_t bufsize = USBTMC_BUFSIZE; u8 *dmabuf = NULL; struct urb *urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return NULL; dmabuf = kzalloc(bufsize, GFP_KERNEL); if (!dmabuf) { usb_free_urb(urb); return NULL; } urb->transfer_buffer = dmabuf; urb->transfer_buffer_length = bufsize; urb->transfer_flags |= URB_FREE_BUFFER; return urb; } static void usbtmc_read_bulk_cb(struct urb *urb) { struct usbtmc_file_data *file_data = urb->context; int status = urb->status; unsigned long flags; /* sync/async unlink faults aren't errors */ if (status) { if (!(/* status == -ENOENT || */ status == -ECONNRESET || status == -EREMOTEIO || /* Short packet */ status == -ESHUTDOWN)) dev_err(&file_data->data->intf->dev, "%s - nonzero read bulk status received: %d\n", __func__, status); spin_lock_irqsave(&file_data->err_lock, flags); if (!file_data->in_status) file_data->in_status = status; spin_unlock_irqrestore(&file_data->err_lock, flags); } spin_lock_irqsave(&file_data->err_lock, flags); file_data->in_transfer_size += urb->actual_length; dev_dbg(&file_data->data->intf->dev, "%s - total size: %u current: %d status: %d\n", __func__, file_data->in_transfer_size, urb->actual_length, status); spin_unlock_irqrestore(&file_data->err_lock, flags); usb_anchor_urb(urb, &file_data->in_anchor); wake_up_interruptible(&file_data->wait_bulk_in); wake_up_interruptible(&file_data->data->waitq); } static inline bool usbtmc_do_transfer(struct usbtmc_file_data *file_data) { bool data_or_error; spin_lock_irq(&file_data->err_lock); data_or_error = !usb_anchor_empty(&file_data->in_anchor) || file_data->in_status; spin_unlock_irq(&file_data->err_lock); dev_dbg(&file_data->data->intf->dev, "%s: returns %d\n", __func__, data_or_error); return data_or_error; } static ssize_t usbtmc_generic_read(struct usbtmc_file_data *file_data, void __user *user_buffer, u32 transfer_size, u32 *transferred, u32 flags) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; u32 done = 0; u32 remaining; const u32 bufsize = USBTMC_BUFSIZE; int retval = 0; u32 max_transfer_size; unsigned long expire; int bufcount = 1; int again = 0; long wait_rv; /* mutex already locked */ *transferred = done; max_transfer_size = transfer_size; if (flags & USBTMC_FLAG_IGNORE_TRAILER) { /* The device may send extra alignment bytes (up to * wMaxPacketSize – 1) to avoid sending a zero-length * packet */ remaining = transfer_size; if ((max_transfer_size % data->wMaxPacketSize) == 0) max_transfer_size += (data->wMaxPacketSize - 1); } else { /* round down to bufsize to avoid truncated data left */ if (max_transfer_size > bufsize) { max_transfer_size = roundup(max_transfer_size + 1 - bufsize, bufsize); } remaining = max_transfer_size; } spin_lock_irq(&file_data->err_lock); if (file_data->in_status) { /* return the very first error */ retval = file_data->in_status; spin_unlock_irq(&file_data->err_lock); goto error; } if (flags & USBTMC_FLAG_ASYNC) { if (usb_anchor_empty(&file_data->in_anchor)) again = 1; if (file_data->in_urbs_used == 0) { file_data->in_transfer_size = 0; file_data->in_status = 0; } } else { file_data->in_transfer_size = 0; file_data->in_status = 0; } if (max_transfer_size == 0) { bufcount = 0; } else { bufcount = roundup(max_transfer_size, bufsize) / bufsize; if (bufcount > file_data->in_urbs_used) bufcount -= file_data->in_urbs_used; else bufcount = 0; if (bufcount + file_data->in_urbs_used > MAX_URBS_IN_FLIGHT) { bufcount = MAX_URBS_IN_FLIGHT - file_data->in_urbs_used; } } spin_unlock_irq(&file_data->err_lock); dev_dbg(dev, "%s: requested=%u flags=0x%X size=%u bufs=%d used=%d\n", __func__, transfer_size, flags, max_transfer_size, bufcount, file_data->in_urbs_used); while (bufcount > 0) { u8 *dmabuf = NULL; struct urb *urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; goto error; } dmabuf = urb->transfer_buffer; usb_fill_bulk_urb(urb, data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), dmabuf, bufsize, usbtmc_read_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); /* urb is anchored. We can release our reference. */ usb_free_urb(urb); if (unlikely(retval)) { usb_unanchor_urb(urb); goto error; } file_data->in_urbs_used++; bufcount--; } if (again) { dev_dbg(dev, "%s: ret=again\n", __func__); return -EAGAIN; } if (user_buffer == NULL) return -EINVAL; expire = msecs_to_jiffies(file_data->timeout); while (max_transfer_size > 0) { u32 this_part; struct urb *urb = NULL; if (!(flags & USBTMC_FLAG_ASYNC)) { dev_dbg(dev, "%s: before wait time %lu\n", __func__, expire); wait_rv = wait_event_interruptible_timeout( file_data->wait_bulk_in, usbtmc_do_transfer(file_data), expire); dev_dbg(dev, "%s: wait returned %ld\n", __func__, wait_rv); if (wait_rv < 0) { retval = wait_rv; goto error; } if (wait_rv == 0) { retval = -ETIMEDOUT; goto error; } } urb = usb_get_from_anchor(&file_data->in_anchor); if (!urb) { if (!(flags & USBTMC_FLAG_ASYNC)) { /* synchronous case: must not happen */ retval = -EFAULT; goto error; } /* asynchronous case: ready, do not block or wait */ *transferred = done; dev_dbg(dev, "%s: (async) done=%u ret=0\n", __func__, done); return 0; } file_data->in_urbs_used--; if (max_transfer_size > urb->actual_length) max_transfer_size -= urb->actual_length; else max_transfer_size = 0; if (remaining > urb->actual_length) this_part = urb->actual_length; else this_part = remaining; print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, urb->transfer_buffer, urb->actual_length, true); if (copy_to_user(user_buffer + done, urb->transfer_buffer, this_part)) { usb_free_urb(urb); retval = -EFAULT; goto error; } remaining -= this_part; done += this_part; spin_lock_irq(&file_data->err_lock); if (urb->status) { /* return the very first error */ retval = file_data->in_status; spin_unlock_irq(&file_data->err_lock); usb_free_urb(urb); goto error; } spin_unlock_irq(&file_data->err_lock); if (urb->actual_length < bufsize) { /* short packet or ZLP received => ready */ usb_free_urb(urb); retval = 1; break; } if (!(flags & USBTMC_FLAG_ASYNC) && max_transfer_size > (bufsize * file_data->in_urbs_used)) { /* resubmit, since other buffers still not enough */ usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); usb_free_urb(urb); goto error; } file_data->in_urbs_used++; } usb_free_urb(urb); retval = 0; } error: *transferred = done; dev_dbg(dev, "%s: before kill\n", __func__); /* Attention: killing urbs can take long time (2 ms) */ usb_kill_anchored_urbs(&file_data->submitted); dev_dbg(dev, "%s: after kill\n", __func__); usb_scuttle_anchored_urbs(&file_data->in_anchor); file_data->in_urbs_used = 0; file_data->in_status = 0; /* no spinlock needed here */ dev_dbg(dev, "%s: done=%u ret=%d\n", __func__, done, retval); return retval; } static ssize_t usbtmc_ioctl_generic_read(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_message msg; ssize_t retval = 0; /* mutex already locked */ if (copy_from_user(&msg, arg, sizeof(struct usbtmc_message))) return -EFAULT; retval = usbtmc_generic_read(file_data, msg.message, msg.transfer_size, &msg.transferred, msg.flags); if (put_user(msg.transferred, &((struct usbtmc_message __user *)arg)->transferred)) return -EFAULT; return retval; } static void usbtmc_write_bulk_cb(struct urb *urb) { struct usbtmc_file_data *file_data = urb->context; int wakeup = 0; unsigned long flags; spin_lock_irqsave(&file_data->err_lock, flags); file_data->out_transfer_size += urb->actual_length; /* sync/async unlink faults aren't errors */ if (urb->status) { if (!(urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -ESHUTDOWN)) dev_err(&file_data->data->intf->dev, "%s - nonzero write bulk status received: %d\n", __func__, urb->status); if (!file_data->out_status) { file_data->out_status = urb->status; wakeup = 1; } } spin_unlock_irqrestore(&file_data->err_lock, flags); dev_dbg(&file_data->data->intf->dev, "%s - write bulk total size: %u\n", __func__, file_data->out_transfer_size); up(&file_data->limit_write_sem); if (usb_anchor_empty(&file_data->submitted) || wakeup) wake_up_interruptible(&file_data->data->waitq); } static ssize_t usbtmc_generic_write(struct usbtmc_file_data *file_data, const void __user *user_buffer, u32 transfer_size, u32 *transferred, u32 flags) { struct usbtmc_device_data *data = file_data->data; struct device *dev; u32 done = 0; u32 remaining; unsigned long expire; const u32 bufsize = USBTMC_BUFSIZE; struct urb *urb = NULL; int retval = 0; u32 timeout; *transferred = 0; /* Get pointer to private data structure */ dev = &data->intf->dev; dev_dbg(dev, "%s: size=%u flags=0x%X sema=%u\n", __func__, transfer_size, flags, file_data->limit_write_sem.count); if (flags & USBTMC_FLAG_APPEND) { spin_lock_irq(&file_data->err_lock); retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (retval < 0) return retval; } else { spin_lock_irq(&file_data->err_lock); file_data->out_transfer_size = 0; file_data->out_status = 0; spin_unlock_irq(&file_data->err_lock); } remaining = transfer_size; if (remaining > INT_MAX) remaining = INT_MAX; timeout = file_data->timeout; expire = msecs_to_jiffies(timeout); while (remaining > 0) { u32 this_part, aligned; u8 *buffer = NULL; if (flags & USBTMC_FLAG_ASYNC) { if (down_trylock(&file_data->limit_write_sem)) { retval = (done)?(0):(-EAGAIN); goto exit; } } else { retval = down_timeout(&file_data->limit_write_sem, expire); if (retval < 0) { retval = -ETIMEDOUT; goto error; } } spin_lock_irq(&file_data->err_lock); retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (retval < 0) { up(&file_data->limit_write_sem); goto error; } /* prepare next urb to send */ urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; up(&file_data->limit_write_sem); goto error; } buffer = urb->transfer_buffer; if (remaining > bufsize) this_part = bufsize; else this_part = remaining; if (copy_from_user(buffer, user_buffer + done, this_part)) { retval = -EFAULT; up(&file_data->limit_write_sem); goto error; } print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, this_part, true); /* fill bulk with 32 bit alignment to meet USBTMC specification * (size + 3 & ~3) rounds up and simplifies user code */ aligned = (this_part + 3) & ~3; dev_dbg(dev, "write(size:%u align:%u done:%u)\n", (unsigned int)this_part, (unsigned int)aligned, (unsigned int)done); usb_fill_bulk_urb(urb, data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), urb->transfer_buffer, aligned, usbtmc_write_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); up(&file_data->limit_write_sem); goto error; } usb_free_urb(urb); urb = NULL; /* urb will be finally released by usb driver */ remaining -= this_part; done += this_part; } /* All urbs are on the fly */ if (!(flags & USBTMC_FLAG_ASYNC)) { if (!usb_wait_anchor_empty_timeout(&file_data->submitted, timeout)) { retval = -ETIMEDOUT; goto error; } } retval = 0; goto exit; error: usb_kill_anchored_urbs(&file_data->submitted); exit: usb_free_urb(urb); spin_lock_irq(&file_data->err_lock); if (!(flags & USBTMC_FLAG_ASYNC)) done = file_data->out_transfer_size; if (!retval && file_data->out_status) retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); *transferred = done; dev_dbg(dev, "%s: done=%u, retval=%d, urbstat=%d\n", __func__, done, retval, file_data->out_status); return retval; } static ssize_t usbtmc_ioctl_generic_write(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_message msg; ssize_t retval = 0; /* mutex already locked */ if (copy_from_user(&msg, arg, sizeof(struct usbtmc_message))) return -EFAULT; retval = usbtmc_generic_write(file_data, msg.message, msg.transfer_size, &msg.transferred, msg.flags); if (put_user(msg.transferred, &((struct usbtmc_message __user *)arg)->transferred)) return -EFAULT; return retval; } /* * Get the generic write result */ static ssize_t usbtmc_ioctl_write_result(struct usbtmc_file_data *file_data, void __user *arg) { u32 transferred; int retval; spin_lock_irq(&file_data->err_lock); transferred = file_data->out_transfer_size; retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (put_user(transferred, (__u32 __user *)arg)) return -EFAULT; return retval; } /* * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-OUT endpoint. * @transfer_size: number of bytes to request from the device. * * See the USBTMC specification, Table 4. * * Also updates bTag_last_write. */ static int send_request_dev_dep_msg_in(struct usbtmc_file_data *file_data, u32 transfer_size) { struct usbtmc_device_data *data = file_data->data; int retval; u8 *buffer; int actual; buffer = kmalloc(USBTMC_HEADER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; /* Setup IO buffer for REQUEST_DEV_DEP_MSG_IN message * Refer to class specs for details */ buffer[0] = 2; buffer[1] = data->bTag; buffer[2] = ~data->bTag; buffer[3] = 0; /* Reserved */ buffer[4] = transfer_size >> 0; buffer[5] = transfer_size >> 8; buffer[6] = transfer_size >> 16; buffer[7] = transfer_size >> 24; buffer[8] = file_data->term_char_enabled * 2; /* Use term character? */ buffer[9] = file_data->term_char; buffer[10] = 0; /* Reserved */ buffer[11] = 0; /* Reserved */ /* Send bulk URB */ retval = usb_bulk_msg(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), buffer, USBTMC_HEADER_SIZE, &actual, file_data->timeout); /* Store bTag (in case we need to abort) */ data->bTag_last_write = data->bTag; /* Increment bTag -- and increment again if zero */ data->bTag++; if (!data->bTag) data->bTag++; kfree(buffer); if (retval < 0) dev_err(&data->intf->dev, "%s returned %d\n", __func__, retval); return retval; } static ssize_t usbtmc_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; struct device *dev; const u32 bufsize = USBTMC_BUFSIZE; u32 n_characters; u8 *buffer; int actual; u32 done = 0; u32 remaining; int retval; /* Get pointer to private data structure */ file_data = filp->private_data; data = file_data->data; dev = &data->intf->dev; buffer = kmalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; retval = mutex_lock_interruptible(&data->io_mutex); if (retval < 0) goto exit_nolock; if (data->zombie) { retval = -ENODEV; goto exit; } if (count > INT_MAX) count = INT_MAX; dev_dbg(dev, "%s(count:%zu)\n", __func__, count); retval = send_request_dev_dep_msg_in(file_data, count); if (retval < 0) { if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_out(data); goto exit; } /* Loop until we have fetched everything we requested */ remaining = count; actual = 0; /* Send bulk URB */ retval = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, bufsize, &actual, file_data->timeout); dev_dbg(dev, "%s: bulk_msg retval(%u), actual(%d)\n", __func__, retval, actual); /* Store bTag (in case we need to abort) */ data->bTag_last_read = data->bTag; if (retval < 0) { if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } /* Sanity checks for the header */ if (actual < USBTMC_HEADER_SIZE) { dev_err(dev, "Device sent too small first packet: %u < %u\n", actual, USBTMC_HEADER_SIZE); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } if (buffer[0] != 2) { dev_err(dev, "Device sent reply with wrong MsgID: %u != 2\n", buffer[0]); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } if (buffer[1] != data->bTag_last_write) { dev_err(dev, "Device sent reply with wrong bTag: %u != %u\n", buffer[1], data->bTag_last_write); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } /* How many characters did the instrument send? */ n_characters = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); file_data->bmTransferAttributes = buffer[8]; dev_dbg(dev, "Bulk-IN header: N_characters(%u), bTransAttr(%u)\n", n_characters, buffer[8]); if (n_characters > remaining) { dev_err(dev, "Device wants to return more data than requested: %u > %zu\n", n_characters, count); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); remaining = n_characters; /* Remove the USBTMC header */ actual -= USBTMC_HEADER_SIZE; /* Remove padding if it exists */ if (actual > remaining) actual = remaining; remaining -= actual; /* Copy buffer to user space */ if (copy_to_user(buf, &buffer[USBTMC_HEADER_SIZE], actual)) { /* There must have been an addressing problem */ retval = -EFAULT; goto exit; } if ((actual + USBTMC_HEADER_SIZE) == bufsize) { retval = usbtmc_generic_read(file_data, buf + actual, remaining, &done, USBTMC_FLAG_IGNORE_TRAILER); if (retval < 0) goto exit; } done += actual; /* Update file position value */ *f_pos = *f_pos + done; retval = done; exit: mutex_unlock(&data->io_mutex); exit_nolock: kfree(buffer); return retval; } static ssize_t usbtmc_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; struct urb *urb = NULL; ssize_t retval = 0; u8 *buffer; u32 remaining, done; u32 transfersize, aligned, buflen; file_data = filp->private_data; data = file_data->data; mutex_lock(&data->io_mutex); if (data->zombie) { retval = -ENODEV; goto exit; } done = 0; spin_lock_irq(&file_data->err_lock); file_data->out_transfer_size = 0; file_data->out_status = 0; spin_unlock_irq(&file_data->err_lock); if (!count) goto exit; if (down_trylock(&file_data->limit_write_sem)) { /* previous calls were async */ retval = -EBUSY; goto exit; } urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; up(&file_data->limit_write_sem); goto exit; } buffer = urb->transfer_buffer; buflen = urb->transfer_buffer_length; if (count > INT_MAX) { transfersize = INT_MAX; buffer[8] = 0; } else { transfersize = count; buffer[8] = file_data->eom_val; } /* Setup IO buffer for DEV_DEP_MSG_OUT message */ buffer[0] = 1; buffer[1] = data->bTag; buffer[2] = ~data->bTag; buffer[3] = 0; /* Reserved */ buffer[4] = transfersize >> 0; buffer[5] = transfersize >> 8; buffer[6] = transfersize >> 16; buffer[7] = transfersize >> 24; /* buffer[8] is set above... */ buffer[9] = 0; /* Reserved */ buffer[10] = 0; /* Reserved */ buffer[11] = 0; /* Reserved */ remaining = transfersize; if (transfersize + USBTMC_HEADER_SIZE > buflen) { transfersize = buflen - USBTMC_HEADER_SIZE; aligned = buflen; } else { aligned = (transfersize + (USBTMC_HEADER_SIZE + 3)) & ~3; } if (copy_from_user(&buffer[USBTMC_HEADER_SIZE], buf, transfersize)) { retval = -EFAULT; up(&file_data->limit_write_sem); goto exit; } dev_dbg(&data->intf->dev, "%s(size:%u align:%u)\n", __func__, (unsigned int)transfersize, (unsigned int)aligned); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, aligned, true); usb_fill_bulk_urb(urb, data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), urb->transfer_buffer, aligned, usbtmc_write_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); up(&file_data->limit_write_sem); goto exit; } remaining -= transfersize; data->bTag_last_write = data->bTag; data->bTag++; if (!data->bTag) data->bTag++; /* call generic_write even when remaining = 0 */ retval = usbtmc_generic_write(file_data, buf + transfersize, remaining, &done, USBTMC_FLAG_APPEND); /* truncate alignment bytes */ if (done > remaining) done = remaining; /*add size of first urb*/ done += transfersize; if (retval < 0) { usb_kill_anchored_urbs(&file_data->submitted); dev_err(&data->intf->dev, "Unable to send data, error %d\n", (int)retval); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_out(data); goto exit; } retval = done; exit: usb_free_urb(urb); mutex_unlock(&data->io_mutex); return retval; } static int usbtmc_ioctl_clear(struct usbtmc_device_data *data) { struct device *dev; u8 *buffer; int rv; int n; int actual = 0; dev = &data->intf->dev; dev_dbg(dev, "Sending INITIATE_CLEAR request\n"); buffer = kmalloc(USBTMC_BUFSIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_CLEAR, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 1, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_CLEAR returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_CLEAR returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_clear_check_status: dev_dbg(dev, "Sending CHECK_CLEAR_STATUS request\n"); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_CLEAR_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_CLEAR_STATUS returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) goto usbtmc_clear_bulk_out_halt; if (buffer[0] != USBTMC_STATUS_PENDING) { dev_err(dev, "CHECK_CLEAR_STATUS returned %x\n", buffer[0]); rv = -EPERM; goto exit; } if ((buffer[1] & 1) != 0) { do { dev_dbg(dev, "Reading from bulk in EP\n"); actual = 0; rv = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, USBTMC_BUFSIZE, &actual, USB_CTRL_GET_TIMEOUT); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); n++; if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } } while ((actual == USBTMC_BUFSIZE) && (n < USBTMC_MAX_READS_TO_CLEAR_BULK_IN)); } else { /* do not stress device with subsequent requests */ msleep(50); n++; } if (n >= USBTMC_MAX_READS_TO_CLEAR_BULK_IN) { dev_err(dev, "Couldn't clear device buffer within %d cycles\n", USBTMC_MAX_READS_TO_CLEAR_BULK_IN); rv = -EPERM; goto exit; } goto usbtmc_clear_check_status; usbtmc_clear_bulk_out_halt: rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) { dev_err(dev, "usb_clear_halt returned %d\n", rv); goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_clear_out_halt(struct usbtmc_device_data *data) { int rv; rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) dev_err(&data->usb_dev->dev, "%s returned %d\n", __func__, rv); return rv; } static int usbtmc_ioctl_clear_in_halt(struct usbtmc_device_data *data) { int rv; rv = usb_clear_halt(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in)); if (rv < 0) dev_err(&data->usb_dev->dev, "%s returned %d\n", __func__, rv); return rv; } static int usbtmc_ioctl_cancel_io(struct usbtmc_file_data *file_data) { spin_lock_irq(&file_data->err_lock); file_data->in_status = -ECANCELED; file_data->out_status = -ECANCELED; spin_unlock_irq(&file_data->err_lock); usb_kill_anchored_urbs(&file_data->submitted); return 0; } static int usbtmc_ioctl_cleanup_io(struct usbtmc_file_data *file_data) { usb_kill_anchored_urbs(&file_data->submitted); usb_scuttle_anchored_urbs(&file_data->in_anchor); spin_lock_irq(&file_data->err_lock); file_data->in_status = 0; file_data->in_transfer_size = 0; file_data->out_status = 0; file_data->out_transfer_size = 0; spin_unlock_irq(&file_data->err_lock); file_data->in_urbs_used = 0; return 0; } static int get_capabilities(struct usbtmc_device_data *data) { struct device *dev = &data->usb_dev->dev; char *buffer; int rv = 0; buffer = kmalloc(0x18, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_GET_CAPABILITIES, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 0x18, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto err_out; } dev_dbg(dev, "GET_CAPABILITIES returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "GET_CAPABILITIES returned %x\n", buffer[0]); rv = -EPERM; goto err_out; } dev_dbg(dev, "Interface capabilities are %x\n", buffer[4]); dev_dbg(dev, "Device capabilities are %x\n", buffer[5]); dev_dbg(dev, "USB488 interface capabilities are %x\n", buffer[14]); dev_dbg(dev, "USB488 device capabilities are %x\n", buffer[15]); data->capabilities.interface_capabilities = buffer[4]; data->capabilities.device_capabilities = buffer[5]; data->capabilities.usb488_interface_capabilities = buffer[14]; data->capabilities.usb488_device_capabilities = buffer[15]; data->usb488_caps = (buffer[14] & 0x07) | ((buffer[15] & 0x0f) << 4); rv = 0; err_out: kfree(buffer); return rv; } #define capability_attribute(name) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_interface *intf = to_usb_interface(dev); \ struct usbtmc_device_data *data = usb_get_intfdata(intf); \ \ return sprintf(buf, "%d\n", data->capabilities.name); \ } \ static DEVICE_ATTR_RO(name) capability_attribute(interface_capabilities); capability_attribute(device_capabilities); capability_attribute(usb488_interface_capabilities); capability_attribute(usb488_device_capabilities); static struct attribute *usbtmc_attrs[] = { &dev_attr_interface_capabilities.attr, &dev_attr_device_capabilities.attr, &dev_attr_usb488_interface_capabilities.attr, &dev_attr_usb488_device_capabilities.attr, NULL, }; ATTRIBUTE_GROUPS(usbtmc); static int usbtmc_ioctl_indicator_pulse(struct usbtmc_device_data *data) { struct device *dev; u8 *buffer; int rv; dev = &data->intf->dev; buffer = kmalloc(2, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INDICATOR_PULSE, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 0x01, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INDICATOR_PULSE returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INDICATOR_PULSE returned %x\n", buffer[0]); rv = -EPERM; goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_request(struct usbtmc_device_data *data, void __user *arg) { struct device *dev = &data->intf->dev; struct usbtmc_ctrlrequest request; u8 *buffer = NULL; int rv; unsigned int is_in, pipe; if (copy_from_user(&request, arg, sizeof(struct usbtmc_ctrlrequest))) return -EFAULT; if (request.req.wLength > USBTMC_BUFSIZE) return -EMSGSIZE; if (request.req.wLength == 0) /* Length-0 requests are never IN */ request.req.bRequestType &= ~USB_DIR_IN; is_in = request.req.bRequestType & USB_DIR_IN; if (request.req.wLength) { buffer = kmalloc(request.req.wLength, GFP_KERNEL); if (!buffer) return -ENOMEM; if (!is_in) { /* Send control data to device */ if (copy_from_user(buffer, request.data, request.req.wLength)) { rv = -EFAULT; goto exit; } } } if (is_in) pipe = usb_rcvctrlpipe(data->usb_dev, 0); else pipe = usb_sndctrlpipe(data->usb_dev, 0); rv = usb_control_msg(data->usb_dev, pipe, request.req.bRequest, request.req.bRequestType, request.req.wValue, request.req.wIndex, buffer, request.req.wLength, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "%s failed %d\n", __func__, rv); goto exit; } if (rv && is_in) { /* Read control data from device */ if (copy_to_user(request.data, buffer, rv)) rv = -EFAULT; } exit: kfree(buffer); return rv; } /* * Get the usb timeout value */ static int usbtmc_ioctl_get_timeout(struct usbtmc_file_data *file_data, void __user *arg) { u32 timeout; timeout = file_data->timeout; return put_user(timeout, (__u32 __user *)arg); } /* * Set the usb timeout value */ static int usbtmc_ioctl_set_timeout(struct usbtmc_file_data *file_data, void __user *arg) { u32 timeout; if (get_user(timeout, (__u32 __user *)arg)) return -EFAULT; /* Note that timeout = 0 means * MAX_SCHEDULE_TIMEOUT in usb_control_msg */ if (timeout < USBTMC_MIN_TIMEOUT) return -EINVAL; file_data->timeout = timeout; return 0; } /* * enables/disables sending EOM on write */ static int usbtmc_ioctl_eom_enable(struct usbtmc_file_data *file_data, void __user *arg) { u8 eom_enable; if (copy_from_user(&eom_enable, arg, sizeof(eom_enable))) return -EFAULT; if (eom_enable > 1) return -EINVAL; file_data->eom_val = eom_enable; return 0; } /* * Configure termination character for read() */ static int usbtmc_ioctl_config_termc(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_termchar termc; if (copy_from_user(&termc, arg, sizeof(termc))) return -EFAULT; if ((termc.term_char_enabled > 1) || (termc.term_char_enabled && !(file_data->data->capabilities.device_capabilities & 1))) return -EINVAL; file_data->term_char = termc.term_char; file_data->term_char_enabled = termc.term_char_enabled; return 0; } static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; int retval = -EBADRQC; __u8 tmp_byte; file_data = file->private_data; data = file_data->data; mutex_lock(&data->io_mutex); if (data->zombie) { retval = -ENODEV; goto skip_io_on_zombie; } switch (cmd) { case USBTMC_IOCTL_CLEAR_OUT_HALT: retval = usbtmc_ioctl_clear_out_halt(data); break; case USBTMC_IOCTL_CLEAR_IN_HALT: retval = usbtmc_ioctl_clear_in_halt(data); break; case USBTMC_IOCTL_INDICATOR_PULSE: retval = usbtmc_ioctl_indicator_pulse(data); break; case USBTMC_IOCTL_CLEAR: retval = usbtmc_ioctl_clear(data); break; case USBTMC_IOCTL_ABORT_BULK_OUT: retval = usbtmc_ioctl_abort_bulk_out(data); break; case USBTMC_IOCTL_ABORT_BULK_IN: retval = usbtmc_ioctl_abort_bulk_in(data); break; case USBTMC_IOCTL_CTRL_REQUEST: retval = usbtmc_ioctl_request(data, (void __user *)arg); break; case USBTMC_IOCTL_GET_TIMEOUT: retval = usbtmc_ioctl_get_timeout(file_data, (void __user *)arg); break; case USBTMC_IOCTL_SET_TIMEOUT: retval = usbtmc_ioctl_set_timeout(file_data, (void __user *)arg); break; case USBTMC_IOCTL_EOM_ENABLE: retval = usbtmc_ioctl_eom_enable(file_data, (void __user *)arg); break; case USBTMC_IOCTL_CONFIG_TERMCHAR: retval = usbtmc_ioctl_config_termc(file_data, (void __user *)arg); break; case USBTMC_IOCTL_WRITE: retval = usbtmc_ioctl_generic_write(file_data, (void __user *)arg); break; case USBTMC_IOCTL_READ: retval = usbtmc_ioctl_generic_read(file_data, (void __user *)arg); break; case USBTMC_IOCTL_WRITE_RESULT: retval = usbtmc_ioctl_write_result(file_data, (void __user *)arg); break; case USBTMC_IOCTL_API_VERSION: retval = put_user(USBTMC_API_VERSION, (__u32 __user *)arg); break; case USBTMC488_IOCTL_GET_CAPS: retval = put_user(data->usb488_caps, (unsigned char __user *)arg); break; case USBTMC488_IOCTL_READ_STB: retval = usbtmc488_ioctl_read_stb(file_data, (void __user *)arg); break; case USBTMC488_IOCTL_REN_CONTROL: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_REN_CONTROL); break; case USBTMC488_IOCTL_GOTO_LOCAL: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_GOTO_LOCAL); break; case USBTMC488_IOCTL_LOCAL_LOCKOUT: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_LOCAL_LOCKOUT); break; case USBTMC488_IOCTL_TRIGGER: retval = usbtmc488_ioctl_trigger(file_data); break; case USBTMC488_IOCTL_WAIT_SRQ: retval = usbtmc488_ioctl_wait_srq(file_data, (__u32 __user *)arg); break; case USBTMC_IOCTL_MSG_IN_ATTR: retval = put_user(file_data->bmTransferAttributes, (__u8 __user *)arg); break; case USBTMC_IOCTL_AUTO_ABORT: retval = get_user(tmp_byte, (unsigned char __user *)arg); if (retval == 0) file_data->auto_abort = !!tmp_byte; break; case USBTMC_IOCTL_GET_STB: retval = usbtmc_get_stb(file_data, &tmp_byte); if (!retval) retval = put_user(tmp_byte, (__u8 __user *)arg); break; case USBTMC_IOCTL_GET_SRQ_STB: retval = usbtmc_ioctl_get_srq_stb(file_data, (void __user *)arg); break; case USBTMC_IOCTL_CANCEL_IO: retval = usbtmc_ioctl_cancel_io(file_data); break; case USBTMC_IOCTL_CLEANUP_IO: retval = usbtmc_ioctl_cleanup_io(file_data); break; } skip_io_on_zombie: mutex_unlock(&data->io_mutex); return retval; } static int usbtmc_fasync(int fd, struct file *file, int on) { struct usbtmc_file_data *file_data = file->private_data; return fasync_helper(fd, file, on, &file_data->data->fasync); } static __poll_t usbtmc_poll(struct file *file, poll_table *wait) { struct usbtmc_file_data *file_data = file->private_data; struct usbtmc_device_data *data = file_data->data; __poll_t mask; mutex_lock(&data->io_mutex); if (data->zombie) { mask = EPOLLHUP | EPOLLERR; goto no_poll; } poll_wait(file, &data->waitq, wait); /* Note that EPOLLPRI is now assigned to SRQ, and * EPOLLIN|EPOLLRDNORM to normal read data. */ mask = 0; if (atomic_read(&file_data->srq_asserted)) mask |= EPOLLPRI; /* Note that the anchor submitted includes all urbs for BULK IN * and OUT. So EPOLLOUT is signaled when BULK OUT is empty and * all BULK IN urbs are completed and moved to in_anchor. */ if (usb_anchor_empty(&file_data->submitted)) mask |= (EPOLLOUT | EPOLLWRNORM); if (!usb_anchor_empty(&file_data->in_anchor)) mask |= (EPOLLIN | EPOLLRDNORM); spin_lock_irq(&file_data->err_lock); if (file_data->in_status || file_data->out_status) mask |= EPOLLERR; spin_unlock_irq(&file_data->err_lock); dev_dbg(&data->intf->dev, "poll mask = %x\n", mask); no_poll: mutex_unlock(&data->io_mutex); return mask; } static const struct file_operations fops = { .owner = THIS_MODULE, .read = usbtmc_read, .write = usbtmc_write, .open = usbtmc_open, .release = usbtmc_release, .flush = usbtmc_flush, .unlocked_ioctl = usbtmc_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = usbtmc_fasync, .poll = usbtmc_poll, .llseek = default_llseek, }; static struct usb_class_driver usbtmc_class = { .name = "usbtmc%d", .fops = &fops, .minor_base = USBTMC_MINOR_BASE, }; static void usbtmc_interrupt(struct urb *urb) { struct usbtmc_device_data *data = urb->context; struct device *dev = &data->intf->dev; int status = urb->status; int rv; dev_dbg(&data->intf->dev, "int status: %d len %d\n", status, urb->actual_length); switch (status) { case 0: /* SUCCESS */ /* check for valid STB notification */ if (data->iin_buffer[0] > 0x81) { data->bNotify1 = data->iin_buffer[0]; data->bNotify2 = data->iin_buffer[1]; atomic_set(&data->iin_data_valid, 1); wake_up_interruptible(&data->waitq); goto exit; } /* check for SRQ notification */ if (data->iin_buffer[0] == 0x81) { unsigned long flags; struct list_head *elem; if (data->fasync) kill_fasync(&data->fasync, SIGIO, POLL_PRI); spin_lock_irqsave(&data->dev_lock, flags); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); file_data->srq_byte = data->iin_buffer[1]; atomic_set(&file_data->srq_asserted, 1); } spin_unlock_irqrestore(&data->dev_lock, flags); dev_dbg(dev, "srq received bTag %x stb %x\n", (unsigned int)data->iin_buffer[0], (unsigned int)data->iin_buffer[1]); wake_up_interruptible_all(&data->waitq); goto exit; } dev_warn(dev, "invalid notification: %x\n", data->iin_buffer[0]); break; case -EOVERFLOW: dev_err(dev, "overflow with length %d, actual length is %d\n", data->iin_wMaxPacketSize, urb->actual_length); fallthrough; default: /* urb terminated, clean up */ dev_dbg(dev, "urb terminated, status: %d\n", status); return; } exit: rv = usb_submit_urb(urb, GFP_ATOMIC); if (rv) dev_err(dev, "usb_submit_urb failed: %d\n", rv); } static void usbtmc_free_int(struct usbtmc_device_data *data) { if (!data->iin_ep_present || !data->iin_urb) return; usb_kill_urb(data->iin_urb); kfree(data->iin_buffer); data->iin_buffer = NULL; usb_free_urb(data->iin_urb); data->iin_urb = NULL; kref_put(&data->kref, usbtmc_delete); } static int usbtmc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usbtmc_device_data *data; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *bulk_in, *bulk_out, *int_in; int retcode; dev_dbg(&intf->dev, "%s called\n", __func__); data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->intf = intf; data->id = id; data->usb_dev = usb_get_dev(interface_to_usbdev(intf)); usb_set_intfdata(intf, data); kref_init(&data->kref); mutex_init(&data->io_mutex); init_waitqueue_head(&data->waitq); atomic_set(&data->iin_data_valid, 0); INIT_LIST_HEAD(&data->file_list); spin_lock_init(&data->dev_lock); data->zombie = 0; /* Initialize USBTMC bTag and other fields */ data->bTag = 1; /* 2 <= bTag <= 127 USBTMC-USB488 subclass specification 4.3.1 */ data->iin_bTag = 2; /* USBTMC devices have only one setting, so use that */ iface_desc = data->intf->cur_altsetting; data->ifnum = iface_desc->desc.bInterfaceNumber; /* Find bulk endpoints */ retcode = usb_find_common_endpoints(iface_desc, &bulk_in, &bulk_out, NULL, NULL); if (retcode) { dev_err(&intf->dev, "bulk endpoints not found\n"); goto err_put; } retcode = -EINVAL; data->bulk_in = bulk_in->bEndpointAddress; data->wMaxPacketSize = usb_endpoint_maxp(bulk_in); if (!data->wMaxPacketSize) goto err_put; dev_dbg(&intf->dev, "Found bulk in endpoint at %u\n", data->bulk_in); data->bulk_out = bulk_out->bEndpointAddress; dev_dbg(&intf->dev, "Found Bulk out endpoint at %u\n", data->bulk_out); /* Find int endpoint */ retcode = usb_find_int_in_endpoint(iface_desc, &int_in); if (!retcode) { data->iin_ep_present = 1; data->iin_ep = int_in->bEndpointAddress; data->iin_wMaxPacketSize = usb_endpoint_maxp(int_in); data->iin_interval = int_in->bInterval; dev_dbg(&intf->dev, "Found Int in endpoint at %u\n", data->iin_ep); } retcode = get_capabilities(data); if (retcode) dev_err(&intf->dev, "can't read capabilities\n"); if (data->iin_ep_present) { /* allocate int urb */ data->iin_urb = usb_alloc_urb(0, GFP_KERNEL); if (!data->iin_urb) { retcode = -ENOMEM; goto error_register; } /* Protect interrupt in endpoint data until iin_urb is freed */ kref_get(&data->kref); /* allocate buffer for interrupt in */ data->iin_buffer = kmalloc(data->iin_wMaxPacketSize, GFP_KERNEL); if (!data->iin_buffer) { retcode = -ENOMEM; goto error_register; } /* fill interrupt urb */ usb_fill_int_urb(data->iin_urb, data->usb_dev, usb_rcvintpipe(data->usb_dev, data->iin_ep), data->iin_buffer, data->iin_wMaxPacketSize, usbtmc_interrupt, data, data->iin_interval); retcode = usb_submit_urb(data->iin_urb, GFP_KERNEL); if (retcode) { dev_err(&intf->dev, "Failed to submit iin_urb\n"); goto error_register; } } retcode = usb_register_dev(intf, &usbtmc_class); if (retcode) { dev_err(&intf->dev, "Not able to get a minor (base %u, slice default): %d\n", USBTMC_MINOR_BASE, retcode); goto error_register; } dev_dbg(&intf->dev, "Using minor number %d\n", intf->minor); return 0; error_register: usbtmc_free_int(data); err_put: kref_put(&data->kref, usbtmc_delete); return retcode; } static void usbtmc_disconnect(struct usb_interface *intf) { struct usbtmc_device_data *data = usb_get_intfdata(intf); struct list_head *elem; usb_deregister_dev(intf, &usbtmc_class); mutex_lock(&data->io_mutex); data->zombie = 1; wake_up_interruptible_all(&data->waitq); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); usb_kill_anchored_urbs(&file_data->submitted); usb_scuttle_anchored_urbs(&file_data->in_anchor); } mutex_unlock(&data->io_mutex); usbtmc_free_int(data); kref_put(&data->kref, usbtmc_delete); } static void usbtmc_draw_down(struct usbtmc_file_data *file_data) { int time; time = usb_wait_anchor_empty_timeout(&file_data->submitted, 1000); if (!time) usb_kill_anchored_urbs(&file_data->submitted); usb_scuttle_anchored_urbs(&file_data->in_anchor); } static int usbtmc_suspend(struct usb_interface *intf, pm_message_t message) { struct usbtmc_device_data *data = usb_get_intfdata(intf); struct list_head *elem; if (!data) return 0; mutex_lock(&data->io_mutex); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); usbtmc_draw_down(file_data); } if (data->iin_ep_present && data->iin_urb) usb_kill_urb(data->iin_urb); mutex_unlock(&data->io_mutex); return 0; } static int usbtmc_resume(struct usb_interface *intf) { struct usbtmc_device_data *data = usb_get_intfdata(intf); int retcode = 0; if (data->iin_ep_present && data->iin_urb) retcode = usb_submit_urb(data->iin_urb, GFP_KERNEL); if (retcode) dev_err(&intf->dev, "Failed to submit iin_urb\n"); return retcode; } static int usbtmc_pre_reset(struct usb_interface *intf) { struct usbtmc_device_data *data = usb_get_intfdata(intf); struct list_head *elem; if (!data) return 0; mutex_lock(&data->io_mutex); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); usbtmc_ioctl_cancel_io(file_data); } return 0; } static int usbtmc_post_reset(struct usb_interface *intf) { struct usbtmc_device_data *data = usb_get_intfdata(intf); mutex_unlock(&data->io_mutex); return 0; } static struct usb_driver usbtmc_driver = { .name = "usbtmc", .id_table = usbtmc_devices, .probe = usbtmc_probe, .disconnect = usbtmc_disconnect, .suspend = usbtmc_suspend, .resume = usbtmc_resume, .pre_reset = usbtmc_pre_reset, .post_reset = usbtmc_post_reset, .dev_groups = usbtmc_groups, }; module_usb_driver(usbtmc_driver); MODULE_DESCRIPTION("USB Test & Measurement class driver"); MODULE_LICENSE("GPL");
3 3 3 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for logging packets. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <net/udp.h> #include <net/tcp.h> #include <net/route.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_LOG.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_log.h> static unsigned int log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct net *net = xt_net(par); struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", loginfo->prefix); return XT_CONTINUE; } static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; if (loginfo->level >= 8) { pr_debug("level %u >= 8\n", loginfo->level); return -EINVAL; } if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { pr_debug("prefix is not null-terminated\n"); return -EINVAL; } ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); if (ret != 0 && !par->nft_compat) { request_module("%s", "nf_log_syslog"); ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); } return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) { nf_logger_put(par->family, NF_LOG_TYPE_LOG); } static struct xt_target log_tg_regs[] __read_mostly = { { .name = "LOG", .family = NFPROTO_IPV4, .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, .destroy = log_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "LOG", .family = NFPROTO_IPV6, .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, .destroy = log_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __init log_tg_init(void) { return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } static void __exit log_tg_exit(void) { xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } module_init(log_tg_init); module_exit(log_tg_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); MODULE_ALIAS("ipt_LOG"); MODULE_ALIAS("ip6t_LOG"); MODULE_SOFTDEP("pre: nf_log_syslog");
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0+ /* * Serial core port device driver * * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ * Author: Tony Lindgren <tony@atomide.com> */ #include <linux/device.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/pnp.h> #include <linux/property.h> #include <linux/serial_core.h> #include <linux/spinlock.h> #include "serial_base.h" #define SERIAL_PORT_AUTOSUSPEND_DELAY_MS 500 /* Only considers pending TX for now. Caller must take care of locking */ static int __serial_port_busy(struct uart_port *port) { return !uart_tx_stopped(port) && !kfifo_is_empty(&port->state->port.xmit_fifo); } static int serial_port_runtime_resume(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port; unsigned long flags; port = port_dev->port; if (port->flags & UPF_DEAD) goto out; /* Flush any pending TX for the port */ uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) goto unlock; if (__serial_port_busy(port)) port->ops->start_tx(port); unlock: uart_port_unlock_irqrestore(port, flags); out: pm_runtime_mark_last_busy(dev); return 0; } static int serial_port_runtime_suspend(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port = port_dev->port; unsigned long flags; bool busy; if (port->flags & UPF_DEAD) return 0; /* * Nothing to do on pm_runtime_force_suspend(), see * DEFINE_RUNTIME_DEV_PM_OPS. */ if (!pm_runtime_enabled(dev)) return 0; uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) { uart_port_unlock_irqrestore(port, flags); return 0; } busy = __serial_port_busy(port); if (busy) port->ops->start_tx(port); uart_port_unlock_irqrestore(port, flags); if (busy) pm_runtime_mark_last_busy(dev); return busy ? -EBUSY : 0; } static void serial_base_port_set_tx(struct uart_port *port, struct serial_port_device *port_dev, bool enabled) { unsigned long flags; uart_port_lock_irqsave(port, &flags); port_dev->tx_enabled = enabled; uart_port_unlock_irqrestore(port, flags); } void serial_base_port_startup(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, true); } void serial_base_port_shutdown(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, false); } static DEFINE_RUNTIME_DEV_PM_OPS(serial_port_pm, serial_port_runtime_suspend, serial_port_runtime_resume, NULL); static int serial_port_probe(struct device *dev) { pm_runtime_enable(dev); pm_runtime_set_autosuspend_delay(dev, SERIAL_PORT_AUTOSUSPEND_DELAY_MS); pm_runtime_use_autosuspend(dev); return 0; } static int serial_port_remove(struct device *dev) { pm_runtime_dont_use_autosuspend(dev); pm_runtime_disable(dev); return 0; } /* * Serial core port device init functions. Note that the physical serial * port device driver may not have completed probe at this point. */ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port) { return serial_ctrl_register_port(drv, port); } EXPORT_SYMBOL(uart_add_one_port); void uart_remove_one_port(struct uart_driver *drv, struct uart_port *port) { serial_ctrl_unregister_port(drv, port); } EXPORT_SYMBOL(uart_remove_one_port); /** * __uart_read_properties - read firmware properties of the given UART port * @port: corresponding port * @use_defaults: apply defaults (when %true) or validate the values (when %false) * * The following device properties are supported: * - clock-frequency (optional) * - fifo-size (optional) * - no-loopback-test (optional) * - reg-shift (defaults may apply) * - reg-offset (value may be validated) * - reg-io-width (defaults may apply or value may be validated) * - interrupts (OF only) * - serial [alias ID] (OF only) * * If the port->dev is of struct platform_device type the interrupt line * will be retrieved via platform_get_irq() call against that device. * Otherwise it will be assigned by fwnode_irq_get() call. In both cases * the index 0 of the resource is used. * * The caller is responsible to initialize the following fields of the @port * ->dev (must be valid) * ->flags * ->iobase * ->mapbase * ->mapsize * ->regshift (if @use_defaults is false) * before calling this function. Alternatively the above mentioned fields * may be zeroed, in such case the only ones, that have associated properties * found, will be set to the respective values. * * If no error happened, the ->irq, ->mapbase, ->mapsize will be altered. * The ->iotype is always altered. * * When @use_defaults is true and the respective property is not found * the following values will be applied: * ->regshift = 0 * In this case IRQ must be provided, otherwise an error will be returned. * * When @use_defaults is false and the respective property is found * the following values will be validated: * - reg-io-width (->iotype) * - reg-offset (->mapsize against ->mapbase) * * Returns: 0 on success or negative errno on failure */ static int __uart_read_properties(struct uart_port *port, bool use_defaults) { struct device *dev = port->dev; u32 value; int ret; /* Read optional UART functional clock frequency */ device_property_read_u32(dev, "clock-frequency", &port->uartclk); /* Read the registers alignment (default: 8-bit) */ ret = device_property_read_u32(dev, "reg-shift", &value); if (ret) port->regshift = use_defaults ? 0 : port->regshift; else port->regshift = value; /* Read the registers I/O access type (default: MMIO 8-bit) */ ret = device_property_read_u32(dev, "reg-io-width", &value); if (ret) { port->iotype = port->iobase ? UPIO_PORT : UPIO_MEM; } else { switch (value) { case 1: port->iotype = UPIO_MEM; break; case 2: port->iotype = UPIO_MEM16; break; case 4: port->iotype = device_is_big_endian(dev) ? UPIO_MEM32BE : UPIO_MEM32; break; default: port->iotype = UPIO_UNKNOWN; break; } } if (!use_defaults && port->iotype == UPIO_UNKNOWN) { dev_err(dev, "Unsupported reg-io-width (%u)\n", value); return -EINVAL; } /* Read the address mapping base offset (default: no offset) */ ret = device_property_read_u32(dev, "reg-offset", &value); if (ret) value = 0; /* Check for shifted address mapping overflow */ if (!use_defaults && port->mapsize < value) { dev_err(dev, "reg-offset %u exceeds region size %pa\n", value, &port->mapsize); return -EINVAL; } port->mapbase += value; port->mapsize -= value; /* Read optional FIFO size */ device_property_read_u32(dev, "fifo-size", &port->fifosize); if (device_property_read_bool(dev, "no-loopback-test")) port->flags |= UPF_SKIP_TEST; /* Get index of serial line, if found in DT aliases */ ret = of_alias_get_id(dev_of_node(dev), "serial"); if (ret >= 0) port->line = ret; if (dev_is_platform(dev)) ret = platform_get_irq(to_platform_device(dev), 0); else if (dev_is_pnp(dev)) { ret = pnp_irq(to_pnp_dev(dev), 0); if (ret < 0) ret = -ENXIO; } else ret = fwnode_irq_get(dev_fwnode(dev), 0); if (ret == -EPROBE_DEFER) return ret; if (ret > 0) port->irq = ret; else if (use_defaults) /* By default IRQ support is mandatory */ return ret; else port->irq = 0; port->flags |= UPF_SHARE_IRQ; return 0; } int uart_read_port_properties(struct uart_port *port) { return __uart_read_properties(port, true); } EXPORT_SYMBOL_GPL(uart_read_port_properties); int uart_read_and_validate_port_properties(struct uart_port *port) { return __uart_read_properties(port, false); } EXPORT_SYMBOL_GPL(uart_read_and_validate_port_properties); static struct device_driver serial_port_driver = { .name = "port", .suppress_bind_attrs = true, .probe = serial_port_probe, .remove = serial_port_remove, .pm = pm_ptr(&serial_port_pm), }; int serial_base_port_init(void) { return serial_base_driver_register(&serial_port_driver); } void serial_base_port_exit(void) { serial_base_driver_unregister(&serial_port_driver); } MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>"); MODULE_DESCRIPTION("Serial controller port driver"); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 13 13 13 13 13 1 13 11 2 9 9 9 9 2 2 2 1 1 1 2 2 2 2 1 1 1 2 2 2 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 11 11 11 11 11 11 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 11 11 10 9 2 2 2 2 9 13 13 13 13 13 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID support for Linux * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2012 Jiri Kosina */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/unaligned.h> #include <asm/byteorder.h> #include <linux/input.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/hid-debug.h> #include <linux/hidraw.h> #include "hid-ids.h" /* * Version Information */ #define DRIVER_DESC "HID core driver" static int hid_ignore_special_drivers = 0; module_param_named(ignore_special_drivers, hid_ignore_special_drivers, int, 0600); MODULE_PARM_DESC(ignore_special_drivers, "Ignore any special drivers and handle all devices by generic driver"); /* * Convert a signed n-bit integer to signed 32-bit integer. */ static s32 snto32(__u32 value, unsigned int n) { if (!value || !n) return 0; if (n > 32) n = 32; return sign_extend32(value, n - 1); } /* * Convert a signed 32-bit integer to a signed n-bit integer. */ static u32 s32ton(__s32 value, unsigned int n) { s32 a; if (!value || !n) return 0; a = value >> (n - 1); if (a && a != -1) return value < 0 ? 1 << (n - 1) : (1 << (n - 1)) - 1; return value & ((1 << n) - 1); } /* * Register a new report for a device. */ struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application) { struct hid_report_enum *report_enum = device->report_enum + type; struct hid_report *report; if (id >= HID_MAX_IDS) return NULL; if (report_enum->report_id_hash[id]) return report_enum->report_id_hash[id]; report = kzalloc(sizeof(struct hid_report), GFP_KERNEL); if (!report) return NULL; if (id != 0) report_enum->numbered = 1; report->id = id; report->type = type; report->size = 0; report->device = device; report->application = application; report_enum->report_id_hash[id] = report; list_add_tail(&report->list, &report_enum->report_list); INIT_LIST_HEAD(&report->field_entry_list); return report; } EXPORT_SYMBOL_GPL(hid_register_report); /* * Register a new field for this report. */ static struct hid_field *hid_register_field(struct hid_report *report, unsigned usages) { struct hid_field *field; if (report->maxfield == HID_MAX_FIELDS) { hid_err(report->device, "too many fields in report\n"); return NULL; } field = kvzalloc((sizeof(struct hid_field) + usages * sizeof(struct hid_usage) + 3 * usages * sizeof(unsigned int)), GFP_KERNEL); if (!field) return NULL; field->index = report->maxfield++; report->field[field->index] = field; field->usage = (struct hid_usage *)(field + 1); field->value = (s32 *)(field->usage + usages); field->new_value = (s32 *)(field->value + usages); field->usages_priorities = (s32 *)(field->new_value + usages); field->report = report; return field; } /* * Open a collection. The type/usage is pushed on the stack. */ static int open_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection; unsigned usage; int collection_index; usage = parser->local.usage[0]; if (parser->collection_stack_ptr == parser->collection_stack_size) { unsigned int *collection_stack; unsigned int new_size = parser->collection_stack_size + HID_COLLECTION_STACK_SIZE; collection_stack = krealloc(parser->collection_stack, new_size * sizeof(unsigned int), GFP_KERNEL); if (!collection_stack) return -ENOMEM; parser->collection_stack = collection_stack; parser->collection_stack_size = new_size; } if (parser->device->maxcollection == parser->device->collection_size) { collection = kmalloc( array3_size(sizeof(struct hid_collection), parser->device->collection_size, 2), GFP_KERNEL); if (collection == NULL) { hid_err(parser->device, "failed to reallocate collection array\n"); return -ENOMEM; } memcpy(collection, parser->device->collection, sizeof(struct hid_collection) * parser->device->collection_size); memset(collection + parser->device->collection_size, 0, sizeof(struct hid_collection) * parser->device->collection_size); kfree(parser->device->collection); parser->device->collection = collection; parser->device->collection_size *= 2; } parser->collection_stack[parser->collection_stack_ptr++] = parser->device->maxcollection; collection_index = parser->device->maxcollection++; collection = parser->device->collection + collection_index; collection->type = type; collection->usage = usage; collection->level = parser->collection_stack_ptr - 1; collection->parent_idx = (collection->level == 0) ? -1 : parser->collection_stack[collection->level - 1]; if (type == HID_COLLECTION_APPLICATION) parser->device->maxapplication++; return 0; } /* * Close a collection. */ static int close_collection(struct hid_parser *parser) { if (!parser->collection_stack_ptr) { hid_err(parser->device, "collection stack underflow\n"); return -EINVAL; } parser->collection_stack_ptr--; return 0; } /* * Climb up the stack, search for the specified collection type * and return the usage. */ static unsigned hid_lookup_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection = parser->device->collection; int n; for (n = parser->collection_stack_ptr - 1; n >= 0; n--) { unsigned index = parser->collection_stack[n]; if (collection[index].type == type) return collection[index].usage; } return 0; /* we know nothing about this usage type */ } /* * Concatenate usage which defines 16 bits or less with the * currently defined usage page to form a 32 bit usage */ static void complete_usage(struct hid_parser *parser, unsigned int index) { parser->local.usage[index] &= 0xFFFF; parser->local.usage[index] |= (parser->global.usage_page & 0xFFFF) << 16; } /* * Add a usage to the temporary parser table. */ static int hid_add_usage(struct hid_parser *parser, unsigned usage, u8 size) { if (parser->local.usage_index >= HID_MAX_USAGES) { hid_err(parser->device, "usage index exceeded\n"); return -1; } parser->local.usage[parser->local.usage_index] = usage; /* * If Usage item only includes usage id, concatenate it with * currently defined usage page */ if (size <= 2) complete_usage(parser, parser->local.usage_index); parser->local.usage_size[parser->local.usage_index] = size; parser->local.collection_index[parser->local.usage_index] = parser->collection_stack_ptr ? parser->collection_stack[parser->collection_stack_ptr - 1] : 0; parser->local.usage_index++; return 0; } /* * Register a new field for this report. */ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsigned flags) { struct hid_report *report; struct hid_field *field; unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; unsigned int usages; unsigned int offset; unsigned int i; unsigned int application; application = hid_lookup_collection(parser, HID_COLLECTION_APPLICATION); report = hid_register_report(parser->device, report_type, parser->global.report_id, application); if (!report) { hid_err(parser->device, "hid_register_report failed\n"); return -1; } /* Handle both signed and unsigned cases properly */ if ((parser->global.logical_minimum < 0 && parser->global.logical_maximum < parser->global.logical_minimum) || (parser->global.logical_minimum >= 0 && (__u32)parser->global.logical_maximum < (__u32)parser->global.logical_minimum)) { dbg_hid("logical range invalid 0x%x 0x%x\n", parser->global.logical_minimum, parser->global.logical_maximum); return -1; } offset = report->size; report->size += parser->global.report_size * parser->global.report_count; if (parser->device->ll_driver->max_buffer_size) max_buffer_size = parser->device->ll_driver->max_buffer_size; /* Total size check: Allow for possible report index byte */ if (report->size > (max_buffer_size - 1) << 3) { hid_err(parser->device, "report is too long\n"); return -1; } if (!parser->local.usage_index) /* Ignore padding fields */ return 0; usages = max_t(unsigned, parser->local.usage_index, parser->global.report_count); field = hid_register_field(report, usages); if (!field) return 0; field->physical = hid_lookup_collection(parser, HID_COLLECTION_PHYSICAL); field->logical = hid_lookup_collection(parser, HID_COLLECTION_LOGICAL); field->application = application; for (i = 0; i < usages; i++) { unsigned j = i; /* Duplicate the last usage we parsed if we have excess values */ if (i >= parser->local.usage_index) j = parser->local.usage_index - 1; field->usage[i].hid = parser->local.usage[j]; field->usage[i].collection_index = parser->local.collection_index[j]; field->usage[i].usage_index = i; field->usage[i].resolution_multiplier = 1; } field->maxusage = usages; field->flags = flags; field->report_offset = offset; field->report_type = report_type; field->report_size = parser->global.report_size; field->report_count = parser->global.report_count; field->logical_minimum = parser->global.logical_minimum; field->logical_maximum = parser->global.logical_maximum; field->physical_minimum = parser->global.physical_minimum; field->physical_maximum = parser->global.physical_maximum; field->unit_exponent = parser->global.unit_exponent; field->unit = parser->global.unit; return 0; } /* * Read data value from item. */ static u32 item_udata(struct hid_item *item) { switch (item->size) { case 1: return item->data.u8; case 2: return item->data.u16; case 4: return item->data.u32; } return 0; } static s32 item_sdata(struct hid_item *item) { switch (item->size) { case 1: return item->data.s8; case 2: return item->data.s16; case 4: return item->data.s32; } return 0; } /* * Process a global item. */ static int hid_parser_global(struct hid_parser *parser, struct hid_item *item) { __s32 raw_value; switch (item->tag) { case HID_GLOBAL_ITEM_TAG_PUSH: if (parser->global_stack_ptr == HID_GLOBAL_STACK_SIZE) { hid_err(parser->device, "global environment stack overflow\n"); return -1; } memcpy(parser->global_stack + parser->global_stack_ptr++, &parser->global, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_POP: if (!parser->global_stack_ptr) { hid_err(parser->device, "global environment stack underflow\n"); return -1; } memcpy(&parser->global, parser->global_stack + --parser->global_stack_ptr, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_USAGE_PAGE: parser->global.usage_page = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM: parser->global.logical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM: if (parser->global.logical_minimum < 0) parser->global.logical_maximum = item_sdata(item); else parser->global.logical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM: parser->global.physical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM: if (parser->global.physical_minimum < 0) parser->global.physical_maximum = item_sdata(item); else parser->global.physical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT: /* Many devices provide unit exponent as a two's complement * nibble due to the common misunderstanding of HID * specification 1.11, 6.2.2.7 Global Items. Attempt to handle * both this and the standard encoding. */ raw_value = item_sdata(item); if (!(raw_value & 0xfffffff0)) parser->global.unit_exponent = snto32(raw_value, 4); else parser->global.unit_exponent = raw_value; return 0; case HID_GLOBAL_ITEM_TAG_UNIT: parser->global.unit = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_REPORT_SIZE: parser->global.report_size = item_udata(item); if (parser->global.report_size > 256) { hid_err(parser->device, "invalid report_size %d\n", parser->global.report_size); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_COUNT: parser->global.report_count = item_udata(item); if (parser->global.report_count > HID_MAX_USAGES) { hid_err(parser->device, "invalid report_count %d\n", parser->global.report_count); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_ID: parser->global.report_id = item_udata(item); if (parser->global.report_id == 0 || parser->global.report_id >= HID_MAX_IDS) { hid_err(parser->device, "report_id %u is invalid\n", parser->global.report_id); return -1; } return 0; default: hid_err(parser->device, "unknown global tag 0x%x\n", item->tag); return -1; } } /* * Process a local item. */ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item) { __u32 data; unsigned n; __u32 count; data = item_udata(item); switch (item->tag) { case HID_LOCAL_ITEM_TAG_DELIMITER: if (data) { /* * We treat items before the first delimiter * as global to all usage sets (branch 0). * In the moment we process only these global * items and the first delimiter set. */ if (parser->local.delimiter_depth != 0) { hid_err(parser->device, "nested delimiters\n"); return -1; } parser->local.delimiter_depth++; parser->local.delimiter_branch++; } else { if (parser->local.delimiter_depth < 1) { hid_err(parser->device, "bogus close delimiter\n"); return -1; } parser->local.delimiter_depth--; } return 0; case HID_LOCAL_ITEM_TAG_USAGE: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } return hid_add_usage(parser, data, item->size); case HID_LOCAL_ITEM_TAG_USAGE_MINIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } parser->local.usage_minimum = data; return 0; case HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } count = data - parser->local.usage_minimum; if (count + parser->local.usage_index >= HID_MAX_USAGES) { /* * We do not warn if the name is not set, we are * actually pre-scanning the device. */ if (dev_name(&parser->device->dev)) hid_warn(parser->device, "ignoring exceeding usage max\n"); data = HID_MAX_USAGES - parser->local.usage_index + parser->local.usage_minimum - 1; if (data <= 0) { hid_err(parser->device, "no more usage index available\n"); return -1; } } for (n = parser->local.usage_minimum; n <= data; n++) if (hid_add_usage(parser, n, item->size)) { dbg_hid("hid_add_usage failed\n"); return -1; } return 0; default: dbg_hid("unknown local item tag 0x%x\n", item->tag); return 0; } return 0; } /* * Concatenate Usage Pages into Usages where relevant: * As per specification, 6.2.2.8: "When the parser encounters a main item it * concatenates the last declared Usage Page with a Usage to form a complete * usage value." */ static void hid_concatenate_last_usage_page(struct hid_parser *parser) { int i; unsigned int usage_page; unsigned int current_page; if (!parser->local.usage_index) return; usage_page = parser->global.usage_page; /* * Concatenate usage page again only if last declared Usage Page * has not been already used in previous usages concatenation */ for (i = parser->local.usage_index - 1; i >= 0; i--) { if (parser->local.usage_size[i] > 2) /* Ignore extended usages */ continue; current_page = parser->local.usage[i] >> 16; if (current_page == usage_page) break; complete_usage(parser, i); } } /* * Process a main item. */ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int ret; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: ret = open_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: ret = close_collection(parser); break; case HID_MAIN_ITEM_TAG_INPUT: ret = hid_add_field(parser, HID_INPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_OUTPUT: ret = hid_add_field(parser, HID_OUTPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_FEATURE: ret = hid_add_field(parser, HID_FEATURE_REPORT, data); break; default: if (item->tag >= HID_MAIN_ITEM_TAG_RESERVED_MIN && item->tag <= HID_MAIN_ITEM_TAG_RESERVED_MAX) hid_warn_ratelimited(parser->device, "reserved main item tag 0x%x\n", item->tag); else hid_warn_ratelimited(parser->device, "unknown main item tag 0x%x\n", item->tag); ret = 0; } memset(&parser->local, 0, sizeof(parser->local)); /* Reset the local parser environment */ return ret; } /* * Process a reserved item. */ static int hid_parser_reserved(struct hid_parser *parser, struct hid_item *item) { dbg_hid("reserved item type, tag 0x%x\n", item->tag); return 0; } /* * Free a report and all registered fields. The field->usage and * field->value table's are allocated behind the field, so we need * only to free(field) itself. */ static void hid_free_report(struct hid_report *report) { unsigned n; kfree(report->field_entries); for (n = 0; n < report->maxfield; n++) kvfree(report->field[n]); kfree(report); } /* * Close report. This function returns the device * state to the point prior to hid_open_report(). */ static void hid_close_report(struct hid_device *device) { unsigned i, j; for (i = 0; i < HID_REPORT_TYPES; i++) { struct hid_report_enum *report_enum = device->report_enum + i; for (j = 0; j < HID_MAX_IDS; j++) { struct hid_report *report = report_enum->report_id_hash[j]; if (report) hid_free_report(report); } memset(report_enum, 0, sizeof(*report_enum)); INIT_LIST_HEAD(&report_enum->report_list); } /* * If the HID driver had a rdesc_fixup() callback, dev->rdesc * will be allocated by hid-core and needs to be freed. * Otherwise, it is either equal to dev_rdesc or bpf_rdesc, in * which cases it'll be freed later on device removal or destroy. */ if (device->rdesc != device->dev_rdesc && device->rdesc != device->bpf_rdesc) kfree(device->rdesc); device->rdesc = NULL; device->rsize = 0; kfree(device->collection); device->collection = NULL; device->collection_size = 0; device->maxcollection = 0; device->maxapplication = 0; device->status &= ~HID_STAT_PARSED; } static inline void hid_free_bpf_rdesc(struct hid_device *hdev) { /* bpf_rdesc is either equal to dev_rdesc or allocated by call_hid_bpf_rdesc_fixup() */ if (hdev->bpf_rdesc != hdev->dev_rdesc) kfree(hdev->bpf_rdesc); hdev->bpf_rdesc = NULL; } /* * Free a device structure, all reports, and all fields. */ void hiddev_free(struct kref *ref) { struct hid_device *hid = container_of(ref, struct hid_device, ref); hid_close_report(hid); hid_free_bpf_rdesc(hid); kfree(hid->dev_rdesc); kfree(hid); } static void hid_device_release(struct device *dev) { struct hid_device *hid = to_hid_device(dev); kref_put(&hid->ref, hiddev_free); } /* * Fetch a report description item from the data stream. We support long * items, though they are not used yet. */ static const u8 *fetch_item(const __u8 *start, const __u8 *end, struct hid_item *item) { u8 b; if ((end - start) <= 0) return NULL; b = *start++; item->type = (b >> 2) & 3; item->tag = (b >> 4) & 15; if (item->tag == HID_ITEM_TAG_LONG) { item->format = HID_ITEM_FORMAT_LONG; if ((end - start) < 2) return NULL; item->size = *start++; item->tag = *start++; if ((end - start) < item->size) return NULL; item->data.longdata = start; start += item->size; return start; } item->format = HID_ITEM_FORMAT_SHORT; item->size = BIT(b & 3) >> 1; /* 0, 1, 2, 3 -> 0, 1, 2, 4 */ if (end - start < item->size) return NULL; switch (item->size) { case 0: break; case 1: item->data.u8 = *start; break; case 2: item->data.u16 = get_unaligned_le16(start); break; case 4: item->data.u32 = get_unaligned_le32(start); break; } return start + item->size; } static void hid_scan_input_usage(struct hid_parser *parser, u32 usage) { struct hid_device *hid = parser->device; if (usage == HID_DG_CONTACTID) hid->group = HID_GROUP_MULTITOUCH; } static void hid_scan_feature_usage(struct hid_parser *parser, u32 usage) { if (usage == 0xff0000c5 && parser->global.report_count == 256 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; if (usage == 0xff0000c6 && parser->global.report_count == 1 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; } static void hid_scan_collection(struct hid_parser *parser, unsigned type) { struct hid_device *hid = parser->device; int i; if (((parser->global.usage_page << 16) == HID_UP_SENSOR) && (type == HID_COLLECTION_PHYSICAL || type == HID_COLLECTION_APPLICATION)) hid->group = HID_GROUP_SENSOR_HUB; if (hid->vendor == USB_VENDOR_ID_MICROSOFT && hid->product == USB_DEVICE_ID_MS_POWER_COVER && hid->group == HID_GROUP_MULTITOUCH) hid->group = HID_GROUP_GENERIC; if ((parser->global.usage_page << 16) == HID_UP_GENDESK) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == HID_GD_POINTER) parser->scan_flags |= HID_SCAN_FLAG_GD_POINTER; if ((parser->global.usage_page << 16) >= HID_UP_MSVENDOR) parser->scan_flags |= HID_SCAN_FLAG_VENDOR_SPECIFIC; if ((parser->global.usage_page << 16) == HID_UP_GOOGLEVENDOR) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == (HID_UP_GOOGLEVENDOR | 0x0001)) parser->device->group = HID_GROUP_VIVALDI; } static int hid_scan_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int i; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: hid_scan_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: break; case HID_MAIN_ITEM_TAG_INPUT: /* ignore constant inputs, they will be ignored by hid-input */ if (data & HID_MAIN_ITEM_CONSTANT) break; for (i = 0; i < parser->local.usage_index; i++) hid_scan_input_usage(parser, parser->local.usage[i]); break; case HID_MAIN_ITEM_TAG_OUTPUT: break; case HID_MAIN_ITEM_TAG_FEATURE: for (i = 0; i < parser->local.usage_index; i++) hid_scan_feature_usage(parser, parser->local.usage[i]); break; } /* Reset the local parser environment */ memset(&parser->local, 0, sizeof(parser->local)); return 0; } /* * Scan a report descriptor before the device is added to the bus. * Sets device groups and other properties that determine what driver * to load. */ static int hid_scan_report(struct hid_device *hid) { struct hid_parser *parser; struct hid_item item; const __u8 *start = hid->dev_rdesc; const __u8 *end = start + hid->dev_rsize; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_scan_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) return -ENOMEM; parser->device = hid; hid->group = HID_GROUP_GENERIC; /* * In case we are re-scanning after a BPF has been loaded, * we need to use the bpf report descriptor, not the original one. */ if (hid->bpf_rdesc && hid->bpf_rsize) { start = hid->bpf_rdesc; end = start + hid->bpf_rsize; } /* * The parsing is simpler than the one in hid_open_report() as we should * be robust against hid errors. Those errors will be raised by * hid_open_report() anyway. */ while ((start = fetch_item(start, end, &item)) != NULL) dispatch_type[item.type](parser, &item); /* * Handle special flags set during scanning. */ if ((parser->scan_flags & HID_SCAN_FLAG_MT_WIN_8) && (hid->group == HID_GROUP_MULTITOUCH)) hid->group = HID_GROUP_MULTITOUCH_WIN_8; /* * Vendor specific handlings */ switch (hid->vendor) { case USB_VENDOR_ID_WACOM: hid->group = HID_GROUP_WACOM; break; case USB_VENDOR_ID_SYNAPTICS: if (hid->group == HID_GROUP_GENERIC) if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER)) /* * hid-rmi should take care of them, * not hid-generic */ hid->group = HID_GROUP_RMI; break; } kfree(parser->collection_stack); vfree(parser); return 0; } /** * hid_parse_report - parse device report * * @hid: hid device * @start: report start * @size: report size * * Allocate the device report as read by the bus driver. This function should * only be called from parse() in ll drivers. */ int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size) { hid->dev_rdesc = kmemdup(start, size, GFP_KERNEL); if (!hid->dev_rdesc) return -ENOMEM; hid->dev_rsize = size; return 0; } EXPORT_SYMBOL_GPL(hid_parse_report); static const char * const hid_report_names[] = { "HID_INPUT_REPORT", "HID_OUTPUT_REPORT", "HID_FEATURE_REPORT", }; /** * hid_validate_values - validate existing device report's value indexes * * @hid: hid device * @type: which report type to examine * @id: which report ID to examine (0 for first) * @field_index: which report field to examine * @report_counts: expected number of values * * Validate the number of values in a given field of a given report, after * parsing. */ struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts) { struct hid_report *report; if (type > HID_FEATURE_REPORT) { hid_err(hid, "invalid HID report type %u\n", type); return NULL; } if (id >= HID_MAX_IDS) { hid_err(hid, "invalid HID report id %u\n", id); return NULL; } /* * Explicitly not using hid_get_report() here since it depends on * ->numbered being checked, which may not always be the case when * drivers go to access report values. */ if (id == 0) { /* * Validating on id 0 means we should examine the first * report in the list. */ report = list_first_entry_or_null( &hid->report_enum[type].report_list, struct hid_report, list); } else { report = hid->report_enum[type].report_id_hash[id]; } if (!report) { hid_err(hid, "missing %s %u\n", hid_report_names[type], id); return NULL; } if (report->maxfield <= field_index) { hid_err(hid, "not enough fields in %s %u\n", hid_report_names[type], id); return NULL; } if (report->field[field_index]->report_count < report_counts) { hid_err(hid, "not enough values in %s %u field %u\n", hid_report_names[type], id, field_index); return NULL; } return report; } EXPORT_SYMBOL_GPL(hid_validate_values); static int hid_calculate_multiplier(struct hid_device *hid, struct hid_field *multiplier) { int m; __s32 v = *multiplier->value; __s32 lmin = multiplier->logical_minimum; __s32 lmax = multiplier->logical_maximum; __s32 pmin = multiplier->physical_minimum; __s32 pmax = multiplier->physical_maximum; /* * "Because OS implementations will generally divide the control's * reported count by the Effective Resolution Multiplier, designers * should take care not to establish a potential Effective * Resolution Multiplier of zero." * HID Usage Table, v1.12, Section 4.3.1, p31 */ if (lmax - lmin == 0) return 1; /* * Handling the unit exponent is left as an exercise to whoever * finds a device where that exponent is not 0. */ m = ((v - lmin)/(lmax - lmin) * (pmax - pmin) + pmin); if (unlikely(multiplier->unit_exponent != 0)) { hid_warn(hid, "unsupported Resolution Multiplier unit exponent %d\n", multiplier->unit_exponent); } /* There are no devices with an effective multiplier > 255 */ if (unlikely(m == 0 || m > 255 || m < -255)) { hid_warn(hid, "unsupported Resolution Multiplier %d\n", m); m = 1; } return m; } static void hid_apply_multiplier_to_field(struct hid_device *hid, struct hid_field *field, struct hid_collection *multiplier_collection, int effective_multiplier) { struct hid_collection *collection; struct hid_usage *usage; int i; /* * If multiplier_collection is NULL, the multiplier applies * to all fields in the report. * Otherwise, it is the Logical Collection the multiplier applies to * but our field may be in a subcollection of that collection. */ for (i = 0; i < field->maxusage; i++) { usage = &field->usage[i]; collection = &hid->collection[usage->collection_index]; while (collection->parent_idx != -1 && collection != multiplier_collection) collection = &hid->collection[collection->parent_idx]; if (collection->parent_idx != -1 || multiplier_collection == NULL) usage->resolution_multiplier = effective_multiplier; } } static void hid_apply_multiplier(struct hid_device *hid, struct hid_field *multiplier) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_field *field; struct hid_collection *multiplier_collection; int effective_multiplier; int i; /* * "The Resolution Multiplier control must be contained in the same * Logical Collection as the control(s) to which it is to be applied. * If no Resolution Multiplier is defined, then the Resolution * Multiplier defaults to 1. If more than one control exists in a * Logical Collection, the Resolution Multiplier is associated with * all controls in the collection. If no Logical Collection is * defined, the Resolution Multiplier is associated with all * controls in the report." * HID Usage Table, v1.12, Section 4.3.1, p30 * * Thus, search from the current collection upwards until we find a * logical collection. Then search all fields for that same parent * collection. Those are the fields the multiplier applies to. * * If we have more than one multiplier, it will overwrite the * applicable fields later. */ multiplier_collection = &hid->collection[multiplier->usage->collection_index]; while (multiplier_collection->parent_idx != -1 && multiplier_collection->type != HID_COLLECTION_LOGICAL) multiplier_collection = &hid->collection[multiplier_collection->parent_idx]; if (multiplier_collection->type != HID_COLLECTION_LOGICAL) multiplier_collection = NULL; effective_multiplier = hid_calculate_multiplier(hid, multiplier); rep_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { field = rep->field[i]; hid_apply_multiplier_to_field(hid, field, multiplier_collection, effective_multiplier); } } } /* * hid_setup_resolution_multiplier - set up all resolution multipliers * * @device: hid device * * Search for all Resolution Multiplier Feature Reports and apply their * value to all matching Input items. This only updates the internal struct * fields. * * The Resolution Multiplier is applied by the hardware. If the multiplier * is anything other than 1, the hardware will send pre-multiplied events * so that the same physical interaction generates an accumulated * accumulated_value = value * * multiplier * This may be achieved by sending * - "value * multiplier" for each event, or * - "value" but "multiplier" times as frequently, or * - a combination of the above * The only guarantee is that the same physical interaction always generates * an accumulated 'value * multiplier'. * * This function must be called before any event processing and after * any SetRequest to the Resolution Multiplier. */ void hid_setup_resolution_multiplier(struct hid_device *hid) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_usage *usage; int i, j; rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { /* Ignore if report count is out of bounds. */ if (rep->field[i]->report_count < 1) continue; for (j = 0; j < rep->field[i]->maxusage; j++) { usage = &rep->field[i]->usage[j]; if (usage->hid == HID_GD_RESOLUTION_MULTIPLIER) hid_apply_multiplier(hid, rep->field[i]); } } } } EXPORT_SYMBOL_GPL(hid_setup_resolution_multiplier); /** * hid_open_report - open a driver-specific device report * * @device: hid device * * Parse a report description into a hid_device structure. Reports are * enumerated, fields are attached to these reports. * 0 returned on success, otherwise nonzero error value. * * This function (or the equivalent hid_parse() macro) should only be * called from probe() in drivers, before starting the device. */ int hid_open_report(struct hid_device *device) { struct hid_parser *parser; struct hid_item item; unsigned int size; const __u8 *start; const __u8 *end; const __u8 *next; int ret; int i; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_parser_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; if (WARN_ON(device->status & HID_STAT_PARSED)) return -EBUSY; start = device->bpf_rdesc; if (WARN_ON(!start)) return -ENODEV; size = device->bpf_rsize; if (device->driver->report_fixup) { /* * device->driver->report_fixup() needs to work * on a copy of our report descriptor so it can * change it. */ __u8 *buf = kmemdup(start, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; start = device->driver->report_fixup(device, buf, &size); /* * The second kmemdup is required in case report_fixup() returns * a static read-only memory, but we have no idea if that memory * needs to be cleaned up or not at the end. */ start = kmemdup(start, size, GFP_KERNEL); kfree(buf); if (start == NULL) return -ENOMEM; } device->rdesc = start; device->rsize = size; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) { ret = -ENOMEM; goto alloc_err; } parser->device = device; end = start + size; device->collection = kcalloc(HID_DEFAULT_NUM_COLLECTIONS, sizeof(struct hid_collection), GFP_KERNEL); if (!device->collection) { ret = -ENOMEM; goto err; } device->collection_size = HID_DEFAULT_NUM_COLLECTIONS; for (i = 0; i < HID_DEFAULT_NUM_COLLECTIONS; i++) device->collection[i].parent_idx = -1; ret = -EINVAL; while ((next = fetch_item(start, end, &item)) != NULL) { start = next; if (item.format != HID_ITEM_FORMAT_SHORT) { hid_err(device, "unexpected long global item\n"); goto err; } if (dispatch_type[item.type](parser, &item)) { hid_err(device, "item %u %u %u %u parsing failed\n", item.format, (unsigned)item.size, (unsigned)item.type, (unsigned)item.tag); goto err; } if (start == end) { if (parser->collection_stack_ptr) { hid_err(device, "unbalanced collection at end of report description\n"); goto err; } if (parser->local.delimiter_depth) { hid_err(device, "unbalanced delimiter at end of report description\n"); goto err; } /* * fetch initial values in case the device's * default multiplier isn't the recommended 1 */ hid_setup_resolution_multiplier(device); kfree(parser->collection_stack); vfree(parser); device->status |= HID_STAT_PARSED; return 0; } } hid_err(device, "item fetching failed at offset %u/%u\n", size - (unsigned int)(end - start), size); err: kfree(parser->collection_stack); alloc_err: vfree(parser); hid_close_report(device); return ret; } EXPORT_SYMBOL_GPL(hid_open_report); /* * Extract/implement a data field from/to a little endian report (bit array). * * Code sort-of follows HID spec: * http://www.usb.org/developers/hidpage/HID1_11.pdf * * While the USB HID spec allows unlimited length bit fields in "report * descriptors", most devices never use more than 16 bits. * One model of UPS is claimed to report "LINEV" as a 32-bit field. * Search linux-kernel and linux-usb-devel archives for "hid-core extract". */ static u32 __extract(u8 *report, unsigned offset, int n) { unsigned int idx = offset / 8; unsigned int bit_nr = 0; unsigned int bit_shift = offset % 8; int bits_to_copy = 8 - bit_shift; u32 value = 0; u32 mask = n < 32 ? (1U << n) - 1 : ~0U; while (n > 0) { value |= ((u32)report[idx] >> bit_shift) << bit_nr; n -= bits_to_copy; bit_nr += bits_to_copy; bits_to_copy = 8; bit_shift = 0; idx++; } return value & mask; } u32 hid_field_extract(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n) { if (n > 32) { hid_warn_once(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } return __extract(report, offset, n); } EXPORT_SYMBOL_GPL(hid_field_extract); /* * "implement" : set bits in a little endian bit stream. * Same concepts as "extract" (see comments above). * The data mangled in the bit stream remains in little endian * order the whole time. It make more sense to talk about * endianness of register values by considering a register * a "cached" copy of the little endian bit stream. */ static void __implement(u8 *report, unsigned offset, int n, u32 value) { unsigned int idx = offset / 8; unsigned int bit_shift = offset % 8; int bits_to_set = 8 - bit_shift; while (n - bits_to_set >= 0) { report[idx] &= ~(0xff << bit_shift); report[idx] |= value << bit_shift; value >>= bits_to_set; n -= bits_to_set; bits_to_set = 8; bit_shift = 0; idx++; } /* last nibble */ if (n) { u8 bit_mask = ((1U << n) - 1); report[idx] &= ~(bit_mask << bit_shift); report[idx] |= value << bit_shift; } } static void implement(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n, u32 value) { if (unlikely(n > 32)) { hid_warn(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } else if (n < 32) { u32 m = (1U << n) - 1; if (unlikely(value > m)) { hid_warn(hid, "%s() called with too large value %d (n: %d)! (%s)\n", __func__, value, n, current->comm); value &= m; } } __implement(report, offset, n, value); } /* * Search an array for a value. */ static int search(__s32 *array, __s32 value, unsigned n) { while (n--) { if (*array++ == value) return 0; } return -1; } /** * hid_match_report - check if driver's raw_event should be called * * @hid: hid device * @report: hid report to match against * * compare hid->driver->report_table->report_type to report->type */ static int hid_match_report(struct hid_device *hid, struct hid_report *report) { const struct hid_report_id *id = hid->driver->report_table; if (!id) /* NULL means all */ return 1; for (; id->report_type != HID_TERMINATOR; id++) if (id->report_type == HID_ANY_ID || id->report_type == report->type) return 1; return 0; } /** * hid_match_usage - check if driver's event should be called * * @hid: hid device * @usage: usage to match against * * compare hid->driver->usage_table->usage_{type,code} to * usage->usage_{type,code} */ static int hid_match_usage(struct hid_device *hid, struct hid_usage *usage) { const struct hid_usage_id *id = hid->driver->usage_table; if (!id) /* NULL means all */ return 1; for (; id->usage_type != HID_ANY_ID - 1; id++) if ((id->usage_hid == HID_ANY_ID || id->usage_hid == usage->hid) && (id->usage_type == HID_ANY_ID || id->usage_type == usage->type) && (id->usage_code == HID_ANY_ID || id->usage_code == usage->code)) return 1; return 0; } static void hid_process_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value, int interrupt) { struct hid_driver *hdrv = hid->driver; int ret; if (!list_empty(&hid->debug_list)) hid_dump_input(hid, usage, value); if (hdrv && hdrv->event && hid_match_usage(hid, usage)) { ret = hdrv->event(hid, field, usage, value); if (ret != 0) { if (ret < 0) hid_err(hid, "%s's event failed with %d\n", hdrv->name, ret); return; } } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_hid_event(hid, field, usage, value); if (hid->claimed & HID_CLAIMED_HIDDEV && interrupt && hid->hiddev_hid_event) hid->hiddev_hid_event(hid, field, usage, value); } /* * Checks if the given value is valid within this field */ static inline int hid_array_value_is_valid(struct hid_field *field, __s32 value) { __s32 min = field->logical_minimum; /* * Value needs to be between logical min and max, and * (value - min) is used as an index in the usage array. * This array is of size field->maxusage */ return value >= min && value <= field->logical_maximum && value - min < field->maxusage; } /* * Fetch the field from the data. The field content is stored for next * report processing (we do differential reporting to the layer). */ static void hid_input_fetch_field(struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned n; unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; memset(value, 0, count * sizeof(__s32)); field->ignored = false; for (n = 0; n < count; n++) { value[n] = min < 0 ? snto32(hid_field_extract(hid, data, offset + n * size, size), size) : hid_field_extract(hid, data, offset + n * size, size); /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && hid_array_value_is_valid(field, value[n]) && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) { field->ignored = true; return; } } } /* * Process a received variable field. */ static void hid_input_var_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int count = field->report_count; __s32 *value = field->new_value; unsigned int n; for (n = 0; n < count; n++) hid_process_event(hid, field, &field->usage[n], value[n], interrupt); memcpy(field->value, value, count * sizeof(__s32)); } /* * Process a received array field. The field content is stored for * next report processing (we do differential reporting to the layer). */ static void hid_input_array_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int n; unsigned int count = field->report_count; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; /* ErrorRollOver */ if (field->ignored) return; for (n = 0; n < count; n++) { if (hid_array_value_is_valid(field, field->value[n]) && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (hid_array_value_is_valid(field, value[n]) && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); } memcpy(field->value, value, count * sizeof(__s32)); } /* * Analyse a received report, and fetch the data from it. The field * content is stored for next report processing (we do differential * reporting to the layer). */ static void hid_process_report(struct hid_device *hid, struct hid_report *report, __u8 *data, int interrupt) { unsigned int a; struct hid_field_entry *entry; struct hid_field *field; /* first retrieve all incoming values in data */ for (a = 0; a < report->maxfield; a++) hid_input_fetch_field(hid, report->field[a], data); if (!list_empty(&report->field_entry_list)) { /* INPUT_REPORT, we have a priority list of fields */ list_for_each_entry(entry, &report->field_entry_list, list) { field = entry->field; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_process_event(hid, field, &field->usage[entry->index], field->new_value[entry->index], interrupt); else hid_input_array_field(hid, field, interrupt); } /* we need to do the memcpy at the end for var items */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) memcpy(field->value, field->new_value, field->report_count * sizeof(__s32)); } } else { /* FEATURE_REPORT, regular processing */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_input_var_field(hid, field, interrupt); else hid_input_array_field(hid, field, interrupt); } } } /* * Insert a given usage_index in a field in the list * of processed usages in the report. * * The elements of lower priority score are processed * first. */ static void __hid_insert_field_entry(struct hid_device *hid, struct hid_report *report, struct hid_field_entry *entry, struct hid_field *field, unsigned int usage_index) { struct hid_field_entry *next; entry->field = field; entry->index = usage_index; entry->priority = field->usages_priorities[usage_index]; /* insert the element at the correct position */ list_for_each_entry(next, &report->field_entry_list, list) { /* * the priority of our element is strictly higher * than the next one, insert it before */ if (entry->priority > next->priority) { list_add_tail(&entry->list, &next->list); return; } } /* lowest priority score: insert at the end */ list_add_tail(&entry->list, &report->field_entry_list); } static void hid_report_process_ordering(struct hid_device *hid, struct hid_report *report) { struct hid_field *field; struct hid_field_entry *entries; unsigned int a, u, usages; unsigned int count = 0; /* count the number of individual fields in the report */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) count += field->report_count; else count++; } /* allocate the memory to process the fields */ entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); if (!entries) return; report->field_entries = entries; /* * walk through all fields in the report and * store them by priority order in report->field_entry_list * * - Var elements are individualized (field + usage_index) * - Arrays are taken as one, we can not chose an order for them */ usages = 0; for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) { for (u = 0; u < field->report_count; u++) { __hid_insert_field_entry(hid, report, &entries[usages], field, u); usages++; } } else { __hid_insert_field_entry(hid, report, &entries[usages], field, 0); usages++; } } } static void hid_process_ordering(struct hid_device *hid) { struct hid_report *report; struct hid_report_enum *report_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(report, &report_enum->report_list, list) hid_report_process_ordering(hid, report); } /* * Output the field into the report. */ static void hid_output_field(const struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; unsigned n; for (n = 0; n < count; n++) { if (field->logical_minimum < 0) /* signed values */ implement(hid, data, offset + n * size, size, s32ton(field->value[n], size)); else /* unsigned values */ implement(hid, data, offset + n * size, size, field->value[n]); } } /* * Compute the size of a report. */ static size_t hid_compute_report_size(struct hid_report *report) { if (report->size) return ((report->size - 1) >> 3) + 1; return 0; } /* * Create a report. 'data' has to be allocated using * hid_alloc_report_buf() so that it has proper size. */ void hid_output_report(struct hid_report *report, __u8 *data) { unsigned n; if (report->id > 0) *data++ = report->id; memset(data, 0, hid_compute_report_size(report)); for (n = 0; n < report->maxfield; n++) hid_output_field(report->device, report->field[n], data); } EXPORT_SYMBOL_GPL(hid_output_report); /* * Allocator for buffer that is going to be passed to hid_output_report() */ u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags) { /* * 7 extra bytes are necessary to achieve proper functionality * of implement() working on 8 byte chunks * 1 extra byte for the report ID if it is null (not used) so * we can reserve that extra byte in the first position of the buffer * when sending it to .raw_request() */ u32 len = hid_report_len(report) + 7 + (report->id == 0); return kzalloc(len, flags); } EXPORT_SYMBOL_GPL(hid_alloc_report_buf); /* * Set a field value. The report this field belongs to has to be * created and transferred to the device, to set this value in the * device. */ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) { unsigned size; if (!field) return -1; size = field->report_size; hid_dump_input(field->report->device, field->usage + offset, value); if (offset >= field->report_count) { hid_err(field->report->device, "offset (%d) exceeds report_count (%d)\n", offset, field->report_count); return -1; } if (field->logical_minimum < 0) { if (value != snto32(s32ton(value, size), size)) { hid_err(field->report->device, "value %d is out of range\n", value); return -1; } } field->value[offset] = value; return 0; } EXPORT_SYMBOL_GPL(hid_set_field); struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, unsigned int application, unsigned int usage) { struct list_head *report_list = &hdev->report_enum[report_type].report_list; struct hid_report *report; int i, j; list_for_each_entry(report, report_list, list) { if (report->application != application) continue; for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == usage) return field; } } } return NULL; } EXPORT_SYMBOL_GPL(hid_find_field); static struct hid_report *hid_get_report(struct hid_report_enum *report_enum, const u8 *data) { struct hid_report *report; unsigned int n = 0; /* Normally report number is 0 */ /* Device uses numbered reports, data[0] is report number */ if (report_enum->numbered) n = *data; report = report_enum->report_id_hash[n]; if (report == NULL) dbg_hid("undefined report_id %u received\n", n); return report; } /* * Implement a generic .request() callback, using .raw_request() * DO NOT USE in hid drivers directly, but through hid_hw_request instead. */ int __hid_request(struct hid_device *hid, struct hid_report *report, enum hid_class_request reqtype) { char *buf, *data_buf; int ret; u32 len; buf = hid_alloc_report_buf(report, GFP_KERNEL); if (!buf) return -ENOMEM; data_buf = buf; len = hid_report_len(report); if (report->id == 0) { /* reserve the first byte for the report ID */ data_buf++; len++; } if (reqtype == HID_REQ_SET_REPORT) hid_output_report(report, data_buf); ret = hid_hw_raw_request(hid, report->id, buf, len, report->type, reqtype); if (ret < 0) { dbg_hid("unable to complete request: %d\n", ret); goto out; } if (reqtype == HID_REQ_GET_REPORT) hid_input_report(hid, report->type, buf, ret, 0); ret = 0; out: kfree(buf); return ret; } EXPORT_SYMBOL_GPL(__hid_request); int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) goto out; if (report_enum->numbered) { cdata++; csize--; } rsize = hid_compute_report_size(report); if (hid->ll_driver->max_buffer_size) max_buffer_size = hid->ll_driver->max_buffer_size; if (report_enum->numbered && rsize >= max_buffer_size) rsize = max_buffer_size - 1; else if (rsize > max_buffer_size) rsize = max_buffer_size; if (csize < rsize) { dbg_hid("report %d is too short, (%d < %d)\n", report->id, csize, rsize); memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) hid->hiddev_report_event(hid, report); if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) goto out; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { hid_process_report(hid, report, cdata, interrupt); hdrv = hid->driver; if (hdrv && hdrv->report) hdrv->report(hid, report); } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); out: return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; struct hid_report *report; int ret = 0; if (!hid) return -ENODEV; ret = down_trylock(&hid->driver_input_lock); if (lock_already_taken && !ret) { up(&hid->driver_input_lock); return -EINVAL; } else if (!lock_already_taken && ret) { return -EBUSY; } if (!hid->driver) { ret = -ENODEV; goto unlock; } report_enum = hid->report_enum + type; hdrv = hid->driver; data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; } if (!size) { dbg_hid("empty report\n"); ret = -1; goto unlock; } /* Avoid unnecessary overhead if debugfs is disabled */ if (!list_empty(&hid->debug_list)) hid_dump_report(hid, type, data, size); report = hid_get_report(report_enum, data); if (!report) { ret = -1; goto unlock; } if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) { ret = hdrv->raw_event(hid, report, data, size); if (ret < 0) goto unlock; } ret = hid_report_raw_event(hid, type, data, size, interrupt); unlock: if (!lock_already_taken) up(&hid->driver_input_lock); return ret; } /** * hid_input_report - report data from lower layer (usb, bt...) * * @hid: hid device * @type: HID report type (HID_*_REPORT) * @data: report contents * @size: size of data parameter * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { return __hid_input_report(hid, type, data, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } EXPORT_SYMBOL_GPL(hid_input_report); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { return (id->bus == HID_BUS_ANY || id->bus == hdev->bus) && (id->group == HID_GROUP_ANY || id->group == hdev->group) && (id->vendor == HID_ANY_ID || id->vendor == hdev->vendor) && (id->product == HID_ANY_ID || id->product == hdev->product); } const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id) { for (; id->bus; id++) if (hid_match_one_id(hdev, id)) return id; return NULL; } EXPORT_SYMBOL_GPL(hid_match_id); static const struct hid_device_id hid_hiddev_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS) }, { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS1) }, { } }; static bool hid_hiddev(struct hid_device *hdev) { return !!hid_match_id(hdev, hid_hiddev_list); } static ssize_t report_descriptor_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct hid_device *hdev = to_hid_device(dev); if (off >= hdev->rsize) return 0; if (off + count > hdev->rsize) count = hdev->rsize - off; memcpy(buf, hdev->rdesc + off, count); return count; } static ssize_t country_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); return sprintf(buf, "%02x\n", hdev->country & 0xff); } static const BIN_ATTR_RO(report_descriptor, HID_MAX_DESCRIPTOR_SIZE); static const DEVICE_ATTR_RO(country); int hid_connect(struct hid_device *hdev, unsigned int connect_mask) { static const char *types[] = { "Device", "Pointer", "Mouse", "Device", "Joystick", "Gamepad", "Keyboard", "Keypad", "Multi-Axis Controller" }; const char *type, *bus; char buf[64] = ""; unsigned int i; int len; int ret; ret = hid_bpf_connect_device(hdev); if (ret) return ret; if (hdev->quirks & HID_QUIRK_HIDDEV_FORCE) connect_mask |= (HID_CONNECT_HIDDEV_FORCE | HID_CONNECT_HIDDEV); if (hdev->quirks & HID_QUIRK_HIDINPUT_FORCE) connect_mask |= HID_CONNECT_HIDINPUT_FORCE; if (hdev->bus != BUS_USB) connect_mask &= ~HID_CONNECT_HIDDEV; if (hid_hiddev(hdev)) connect_mask |= HID_CONNECT_HIDDEV_FORCE; if ((connect_mask & HID_CONNECT_HIDINPUT) && !hidinput_connect(hdev, connect_mask & HID_CONNECT_HIDINPUT_FORCE)) hdev->claimed |= HID_CLAIMED_INPUT; if ((connect_mask & HID_CONNECT_HIDDEV) && hdev->hiddev_connect && !hdev->hiddev_connect(hdev, connect_mask & HID_CONNECT_HIDDEV_FORCE)) hdev->claimed |= HID_CLAIMED_HIDDEV; if ((connect_mask & HID_CONNECT_HIDRAW) && !hidraw_connect(hdev)) hdev->claimed |= HID_CLAIMED_HIDRAW; if (connect_mask & HID_CONNECT_DRIVER) hdev->claimed |= HID_CLAIMED_DRIVER; /* Drivers with the ->raw_event callback set are not required to connect * to any other listener. */ if (!hdev->claimed && !hdev->driver->raw_event) { hid_err(hdev, "device has no listeners, quitting\n"); return -ENODEV; } hid_process_ordering(hdev); if ((hdev->claimed & HID_CLAIMED_INPUT) && (connect_mask & HID_CONNECT_FF) && hdev->ff_init) hdev->ff_init(hdev); len = 0; if (hdev->claimed & HID_CLAIMED_INPUT) len += sprintf(buf + len, "input"); if (hdev->claimed & HID_CLAIMED_HIDDEV) len += sprintf(buf + len, "%shiddev%d", len ? "," : "", ((struct hiddev *)hdev->hiddev)->minor); if (hdev->claimed & HID_CLAIMED_HIDRAW) len += sprintf(buf + len, "%shidraw%d", len ? "," : "", ((struct hidraw *)hdev->hidraw)->minor); type = "Device"; for (i = 0; i < hdev->maxcollection; i++) { struct hid_collection *col = &hdev->collection[i]; if (col->type == HID_COLLECTION_APPLICATION && (col->usage & HID_USAGE_PAGE) == HID_UP_GENDESK && (col->usage & 0xffff) < ARRAY_SIZE(types)) { type = types[col->usage & 0xffff]; break; } } switch (hdev->bus) { case BUS_USB: bus = "USB"; break; case BUS_BLUETOOTH: bus = "BLUETOOTH"; break; case BUS_I2C: bus = "I2C"; break; case BUS_SDW: bus = "SOUNDWIRE"; break; case BUS_VIRTUAL: bus = "VIRTUAL"; break; case BUS_INTEL_ISHTP: case BUS_AMD_SFH: bus = "SENSOR HUB"; break; default: bus = "<UNKNOWN>"; } ret = device_create_file(&hdev->dev, &dev_attr_country); if (ret) hid_warn(hdev, "can't create sysfs country code attribute err: %d\n", ret); hid_info(hdev, "%s: %s HID v%x.%02x %s [%s] on %s\n", buf, bus, hdev->version >> 8, hdev->version & 0xff, type, hdev->name, hdev->phys); return 0; } EXPORT_SYMBOL_GPL(hid_connect); void hid_disconnect(struct hid_device *hdev) { device_remove_file(&hdev->dev, &dev_attr_country); if (hdev->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDDEV) hdev->hiddev_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDRAW) hidraw_disconnect(hdev); hdev->claimed = 0; hid_bpf_disconnect_device(hdev); } EXPORT_SYMBOL_GPL(hid_disconnect); /** * hid_hw_start - start underlying HW * @hdev: hid device * @connect_mask: which outputs to connect, see HID_CONNECT_* * * Call this in probe function *after* hid_parse. This will setup HW * buffers and start the device (if not defeirred to device open). * hid_hw_stop must be called if this was successful. */ int hid_hw_start(struct hid_device *hdev, unsigned int connect_mask) { int error; error = hdev->ll_driver->start(hdev); if (error) return error; if (connect_mask) { error = hid_connect(hdev, connect_mask); if (error) { hdev->ll_driver->stop(hdev); return error; } } return 0; } EXPORT_SYMBOL_GPL(hid_hw_start); /** * hid_hw_stop - stop underlying HW * @hdev: hid device * * This is usually called from remove function or from probe when something * failed and hid_hw_start was called already. */ void hid_hw_stop(struct hid_device *hdev) { hid_disconnect(hdev); hdev->ll_driver->stop(hdev); } EXPORT_SYMBOL_GPL(hid_hw_stop); /** * hid_hw_open - signal underlying HW to start delivering events * @hdev: hid device * * Tell underlying HW to start delivering events from the device. * This function should be called sometime after successful call * to hid_hw_start(). */ int hid_hw_open(struct hid_device *hdev) { int ret; ret = mutex_lock_killable(&hdev->ll_open_lock); if (ret) return ret; if (!hdev->ll_open_count++) { ret = hdev->ll_driver->open(hdev); if (ret) hdev->ll_open_count--; if (hdev->driver->on_hid_hw_open) hdev->driver->on_hid_hw_open(hdev); } mutex_unlock(&hdev->ll_open_lock); return ret; } EXPORT_SYMBOL_GPL(hid_hw_open); /** * hid_hw_close - signal underlaying HW to stop delivering events * * @hdev: hid device * * This function indicates that we are not interested in the events * from this device anymore. Delivery of events may or may not stop, * depending on the number of users still outstanding. */ void hid_hw_close(struct hid_device *hdev) { mutex_lock(&hdev->ll_open_lock); if (!--hdev->ll_open_count) { hdev->ll_driver->close(hdev); if (hdev->driver->on_hid_hw_close) hdev->driver->on_hid_hw_close(hdev); } mutex_unlock(&hdev->ll_open_lock); } EXPORT_SYMBOL_GPL(hid_hw_close); /** * hid_hw_request - send report request to device * * @hdev: hid device * @report: report to send * @reqtype: hid request type */ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype) { if (hdev->ll_driver->request) return hdev->ll_driver->request(hdev, report, reqtype); __hid_request(hdev, report, reqtype); } EXPORT_SYMBOL_GPL(hid_hw_request); int __hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_raw_requests(hdev, reportnum, buf, len, rtype, reqtype, source, from_bpf); if (ret) return ret; return hdev->ll_driver->raw_request(hdev, reportnum, buf, len, rtype, reqtype); } /** * hid_hw_raw_request - send report request to device * * @hdev: hid device * @reportnum: report ID * @buf: in/out data to transfer * @len: length of buf * @rtype: HID report type * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT * * Return: count of data transferred, negative if error * * Same behavior as hid_hw_request, but with raw buffers instead. */ int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype) { return __hid_hw_raw_request(hdev, reportnum, buf, len, rtype, reqtype, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_raw_request); int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_output_report(hdev, buf, len, source, from_bpf); if (ret) return ret; if (hdev->ll_driver->output_report) return hdev->ll_driver->output_report(hdev, buf, len); return -ENOSYS; } /** * hid_hw_output_report - send output report to device * * @hdev: hid device * @buf: raw data to transfer * @len: length of buf * * Return: count of data transferred, negative if error */ int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len) { return __hid_hw_output_report(hdev, buf, len, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_output_report); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { if (hdev->driver && hdev->driver->suspend) return hdev->driver->suspend(hdev, state); return 0; } EXPORT_SYMBOL_GPL(hid_driver_suspend); int hid_driver_reset_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->reset_resume) return hdev->driver->reset_resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_reset_resume); int hid_driver_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->resume) return hdev->driver->resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_resume); #endif /* CONFIG_PM */ struct hid_dynid { struct list_head list; struct hid_device_id id; }; /** * new_id_store - add a new HID device ID to this driver and re-probe devices * @drv: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Adds a new dynamic hid device ID to this driver, * and causes the driver to probe for all devices again. */ static ssize_t new_id_store(struct device_driver *drv, const char *buf, size_t count) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_dynid *dynid; __u32 bus, vendor, product; unsigned long driver_data = 0; int ret; ret = sscanf(buf, "%x %x %x %lx", &bus, &vendor, &product, &driver_data); if (ret < 3) return -EINVAL; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; dynid->id.bus = bus; dynid->id.group = HID_GROUP_ANY; dynid->id.vendor = vendor; dynid->id.product = product; dynid->id.driver_data = driver_data; spin_lock(&hdrv->dyn_lock); list_add_tail(&dynid->list, &hdrv->dyn_list); spin_unlock(&hdrv->dyn_lock); ret = driver_attach(&hdrv->driver); return ret ? : count; } static DRIVER_ATTR_WO(new_id); static struct attribute *hid_drv_attrs[] = { &driver_attr_new_id.attr, NULL, }; ATTRIBUTE_GROUPS(hid_drv); static void hid_free_dynids(struct hid_driver *hdrv) { struct hid_dynid *dynid, *n; spin_lock(&hdrv->dyn_lock); list_for_each_entry_safe(dynid, n, &hdrv->dyn_list, list) { list_del(&dynid->list); kfree(dynid); } spin_unlock(&hdrv->dyn_lock); } const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv) { struct hid_dynid *dynid; spin_lock(&hdrv->dyn_lock); list_for_each_entry(dynid, &hdrv->dyn_list, list) { if (hid_match_one_id(hdev, &dynid->id)) { spin_unlock(&hdrv->dyn_lock); return &dynid->id; } } spin_unlock(&hdrv->dyn_lock); return hid_match_id(hdev, hdrv->id_table); } EXPORT_SYMBOL_GPL(hid_match_device); static int hid_bus_match(struct device *dev, const struct device_driver *drv) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_device *hdev = to_hid_device(dev); return hid_match_device(hdev, hdrv) != NULL; } /** * hid_compare_device_paths - check if both devices share the same path * @hdev_a: hid device * @hdev_b: hid device * @separator: char to use as separator * * Check if two devices share the same path up to the last occurrence of * the separator char. Both paths must exist (i.e., zero-length paths * don't match). */ bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator) { int n1 = strrchr(hdev_a->phys, separator) - hdev_a->phys; int n2 = strrchr(hdev_b->phys, separator) - hdev_b->phys; if (n1 != n2 || n1 <= 0 || n2 <= 0) return false; return !strncmp(hdev_a->phys, hdev_b->phys, n1); } EXPORT_SYMBOL_GPL(hid_compare_device_paths); static bool hid_check_device_match(struct hid_device *hdev, struct hid_driver *hdrv, const struct hid_device_id **id) { *id = hid_match_device(hdev, hdrv); if (!*id) return false; if (hdrv->match) return hdrv->match(hdev, hid_ignore_special_drivers); /* * hid-generic implements .match(), so we must be dealing with a * different HID driver here, and can simply check if * hid_ignore_special_drivers or HID_QUIRK_IGNORE_SPECIAL_DRIVER * are set or not. */ return !hid_ignore_special_drivers && !(hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER); } static void hid_set_group(struct hid_device *hdev) { int ret; if (hid_ignore_special_drivers) { hdev->group = HID_GROUP_GENERIC; } else if (!hdev->group && !(hdev->quirks & HID_QUIRK_HAVE_SPECIAL_DRIVER)) { ret = hid_scan_report(hdev); if (ret) hid_warn(hdev, "bad device descriptor (%d)\n", ret); } } static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv) { const struct hid_device_id *id; int ret; if (!hdev->bpf_rsize) { /* we keep a reference to the currently scanned report descriptor */ const __u8 *original_rdesc = hdev->bpf_rdesc; if (!original_rdesc) original_rdesc = hdev->dev_rdesc; /* in case a bpf program gets detached, we need to free the old one */ hid_free_bpf_rdesc(hdev); /* keep this around so we know we called it once */ hdev->bpf_rsize = hdev->dev_rsize; /* call_hid_bpf_rdesc_fixup will always return a valid pointer */ hdev->bpf_rdesc = call_hid_bpf_rdesc_fixup(hdev, hdev->dev_rdesc, &hdev->bpf_rsize); /* the report descriptor changed, we need to re-scan it */ if (original_rdesc != hdev->bpf_rdesc) { hdev->group = 0; hid_set_group(hdev); } } if (!hid_check_device_match(hdev, hdrv, &id)) return -ENODEV; hdev->devres_group_id = devres_open_group(&hdev->dev, NULL, GFP_KERNEL); if (!hdev->devres_group_id) return -ENOMEM; /* reset the quirks that has been previously set */ hdev->quirks = hid_lookup_quirk(hdev); hdev->driver = hdrv; if (hdrv->probe) { ret = hdrv->probe(hdev, id); } else { /* default probe */ ret = hid_open_report(hdev); if (!ret) ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); } /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when hid_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. */ if (ret) { devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } return ret; } static int hid_device_probe(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv = to_hid_driver(dev->driver); int ret = 0; if (down_interruptible(&hdev->driver_input_lock)) return -EINTR; hdev->io_started = false; clear_bit(ffs(HID_STAT_REPROBED), &hdev->status); if (!hdev->driver) ret = __hid_device_probe(hdev, hdrv); if (!hdev->io_started) up(&hdev->driver_input_lock); return ret; } static void hid_device_remove(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv; down(&hdev->driver_input_lock); hdev->io_started = false; hdrv = hdev->driver; if (hdrv) { if (hdrv->remove) hdrv->remove(hdev); else /* default remove */ hid_hw_stop(hdev); /* Release all devres resources allocated by the driver */ devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } if (!hdev->io_started) up(&hdev->driver_input_lock); } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, char *buf) { struct hid_device *hdev = container_of(dev, struct hid_device, dev); return sysfs_emit(buf, "hid:b%04Xg%04Xv%08Xp%08X\n", hdev->bus, hdev->group, hdev->vendor, hdev->product); } static DEVICE_ATTR_RO(modalias); static struct attribute *hid_dev_attrs[] = { &dev_attr_modalias.attr, NULL, }; static const struct bin_attribute *hid_dev_bin_attrs[] = { &bin_attr_report_descriptor, NULL }; static const struct attribute_group hid_dev_group = { .attrs = hid_dev_attrs, .bin_attrs = hid_dev_bin_attrs, }; __ATTRIBUTE_GROUPS(hid_dev); static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct hid_device *hdev = to_hid_device(dev); if (add_uevent_var(env, "HID_ID=%04X:%08X:%08X", hdev->bus, hdev->vendor, hdev->product)) return -ENOMEM; if (add_uevent_var(env, "HID_NAME=%s", hdev->name)) return -ENOMEM; if (add_uevent_var(env, "HID_PHYS=%s", hdev->phys)) return -ENOMEM; if (add_uevent_var(env, "HID_UNIQ=%s", hdev->uniq)) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=hid:b%04Xg%04Xv%08Xp%08X", hdev->bus, hdev->group, hdev->vendor, hdev->product)) return -ENOMEM; return 0; } const struct bus_type hid_bus_type = { .name = "hid", .dev_groups = hid_dev_groups, .drv_groups = hid_drv_groups, .match = hid_bus_match, .probe = hid_device_probe, .remove = hid_device_remove, .uevent = hid_uevent, }; EXPORT_SYMBOL(hid_bus_type); int hid_add_device(struct hid_device *hdev) { static atomic_t id = ATOMIC_INIT(0); int ret; if (WARN_ON(hdev->status & HID_STAT_ADDED)) return -EBUSY; hdev->quirks = hid_lookup_quirk(hdev); /* we need to kill them here, otherwise they will stay allocated to * wait for coming driver */ if (hid_ignore(hdev)) return -ENODEV; /* * Check for the mandatory transport channel. */ if (!hdev->ll_driver->raw_request) { hid_err(hdev, "transport driver missing .raw_request()\n"); return -EINVAL; } /* * Read the device report descriptor once and use as template * for the driver-specific modifications. */ ret = hdev->ll_driver->parse(hdev); if (ret) return ret; if (!hdev->dev_rdesc) return -ENODEV; /* * Scan generic devices for group information */ hid_set_group(hdev); hdev->id = atomic_inc_return(&id); /* XXX hack, any other cleaner solution after the driver core * is converted to allow more than 20 bytes as the device name? */ dev_set_name(&hdev->dev, "%04X:%04X:%04X.%04X", hdev->bus, hdev->vendor, hdev->product, hdev->id); hid_debug_register(hdev, dev_name(&hdev->dev)); ret = device_add(&hdev->dev); if (!ret) hdev->status |= HID_STAT_ADDED; else hid_debug_unregister(hdev); return ret; } EXPORT_SYMBOL_GPL(hid_add_device); /** * hid_allocate_device - allocate new hid device descriptor * * Allocate and initialize hid device, so that hid_destroy_device might be * used to free it. * * New hid_device pointer is returned on success, otherwise ERR_PTR encoded * error value. */ struct hid_device *hid_allocate_device(void) { struct hid_device *hdev; int ret = -ENOMEM; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (hdev == NULL) return ERR_PTR(ret); device_initialize(&hdev->dev); hdev->dev.release = hid_device_release; hdev->dev.bus = &hid_bus_type; device_enable_async_suspend(&hdev->dev); hid_close_report(hdev); init_waitqueue_head(&hdev->debug_wait); INIT_LIST_HEAD(&hdev->debug_list); spin_lock_init(&hdev->debug_list_lock); sema_init(&hdev->driver_input_lock, 1); mutex_init(&hdev->ll_open_lock); kref_init(&hdev->ref); ret = hid_bpf_device_init(hdev); if (ret) goto out_err; return hdev; out_err: hid_destroy_device(hdev); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(hid_allocate_device); static void hid_remove_device(struct hid_device *hdev) { if (hdev->status & HID_STAT_ADDED) { device_del(&hdev->dev); hid_debug_unregister(hdev); hdev->status &= ~HID_STAT_ADDED; } hid_free_bpf_rdesc(hdev); kfree(hdev->dev_rdesc); hdev->dev_rdesc = NULL; hdev->dev_rsize = 0; hdev->bpf_rsize = 0; } /** * hid_destroy_device - free previously allocated device * * @hdev: hid device * * If you allocate hid_device through hid_allocate_device, you should ever * free by this function. */ void hid_destroy_device(struct hid_device *hdev) { hid_bpf_destroy_device(hdev); hid_remove_device(hdev); put_device(&hdev->dev); } EXPORT_SYMBOL_GPL(hid_destroy_device); static int __hid_bus_reprobe_drivers(struct device *dev, void *data) { struct hid_driver *hdrv = data; struct hid_device *hdev = to_hid_device(dev); if (hdev->driver == hdrv && !hdrv->match(hdev, hid_ignore_special_drivers) && !test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status)) return device_reprobe(dev); return 0; } static int __hid_bus_driver_added(struct device_driver *drv, void *data) { struct hid_driver *hdrv = to_hid_driver(drv); if (hdrv->match) { bus_for_each_dev(&hid_bus_type, NULL, hdrv, __hid_bus_reprobe_drivers); } return 0; } static int __bus_removed_driver(struct device_driver *drv, void *data) { return bus_rescan_devices(&hid_bus_type); } int __hid_register_driver(struct hid_driver *hdrv, struct module *owner, const char *mod_name) { int ret; hdrv->driver.name = hdrv->name; hdrv->driver.bus = &hid_bus_type; hdrv->driver.owner = owner; hdrv->driver.mod_name = mod_name; INIT_LIST_HEAD(&hdrv->dyn_list); spin_lock_init(&hdrv->dyn_lock); ret = driver_register(&hdrv->driver); if (ret == 0) bus_for_each_drv(&hid_bus_type, NULL, NULL, __hid_bus_driver_added); return ret; } EXPORT_SYMBOL_GPL(__hid_register_driver); void hid_unregister_driver(struct hid_driver *hdrv) { driver_unregister(&hdrv->driver); hid_free_dynids(hdrv); bus_for_each_drv(&hid_bus_type, NULL, hdrv, __bus_removed_driver); } EXPORT_SYMBOL_GPL(hid_unregister_driver); int hid_check_keys_pressed(struct hid_device *hid) { struct hid_input *hidinput; int i; if (!(hid->claimed & HID_CLAIMED_INPUT)) return 0; list_for_each_entry(hidinput, &hid->inputs, list) { for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) if (hidinput->input->key[i]) return 1; } return 0; } EXPORT_SYMBOL_GPL(hid_check_keys_pressed); #ifdef CONFIG_HID_BPF static const struct hid_ops __hid_ops = { .hid_get_report = hid_get_report, .hid_hw_raw_request = __hid_hw_raw_request, .hid_hw_output_report = __hid_hw_output_report, .hid_input_report = __hid_input_report, .owner = THIS_MODULE, .bus_type = &hid_bus_type, }; #endif static int __init hid_init(void) { int ret; ret = bus_register(&hid_bus_type); if (ret) { pr_err("can't register hid bus\n"); goto err; } #ifdef CONFIG_HID_BPF hid_ops = &__hid_ops; #endif ret = hidraw_init(); if (ret) goto err_bus; hid_debug_init(); return 0; err_bus: bus_unregister(&hid_bus_type); err: return ret; } static void __exit hid_exit(void) { #ifdef CONFIG_HID_BPF hid_ops = NULL; #endif hid_debug_exit(); hidraw_exit(); bus_unregister(&hid_bus_type); hid_quirks_exit(HID_BUS_ANY); } module_init(hid_init); module_exit(hid_exit); MODULE_AUTHOR("Andreas Gal"); MODULE_AUTHOR("Vojtech Pavlik"); MODULE_AUTHOR("Jiri Kosina"); MODULE_DESCRIPTION("HID support for Linux"); MODULE_LICENSE("GPL");
3 3 3 3 3 3 3 3 3 3 2 30 19 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/workqueue.h> #include "internal.h" static LIST_HEAD(crypto_template_list); static inline void crypto_check_module_sig(struct module *mod) { if (fips_enabled && mod && !module_sig_ok(mod)) panic("Module %s signature verification failed in FIPS mode\n", module_name(mod)); } static int crypto_check_alg(struct crypto_alg *alg) { crypto_check_module_sig(alg->cra_module); if (!alg->cra_name[0] || !alg->cra_driver_name[0]) return -EINVAL; if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; /* General maximums for all algs. */ if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE) return -EINVAL; /* Lower maximums for specific alg types. */ if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) { if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE) return -EINVAL; } if (alg->cra_priority < 0) return -EINVAL; refcount_set(&alg->cra_refcnt, 1); return 0; } static void crypto_free_instance(struct crypto_instance *inst) { inst->alg.cra_type->free(inst); } static void crypto_destroy_instance_workfn(struct work_struct *w) { struct crypto_template *tmpl = container_of(w, struct crypto_template, free_work); struct crypto_instance *inst; struct hlist_node *n; HLIST_HEAD(list); down_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, &tmpl->dead, list) { if (refcount_read(&inst->alg.cra_refcnt) != -1) continue; hlist_del(&inst->list); hlist_add_head(&inst->list, &list); } up_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, &list, list) crypto_free_instance(inst); } static void crypto_destroy_instance(struct crypto_alg *alg) { struct crypto_instance *inst = container_of(alg, struct crypto_instance, alg); struct crypto_template *tmpl = inst->tmpl; refcount_set(&alg->cra_refcnt, -1); schedule_work(&tmpl->free_work); } /* * This function adds a spawn to the list secondary_spawns which * will be used at the end of crypto_remove_spawns to unregister * instances, unless the spawn happens to be one that is depended * on by the new algorithm (nalg in crypto_remove_spawns). * * This function is also responsible for resurrecting any algorithms * in the dependency chain of nalg by unsetting n->dead. */ static struct list_head *crypto_more_spawns(struct crypto_alg *alg, struct list_head *stack, struct list_head *top, struct list_head *secondary_spawns) { struct crypto_spawn *spawn, *n; spawn = list_first_entry_or_null(stack, struct crypto_spawn, list); if (!spawn) return NULL; n = list_prev_entry(spawn, list); list_move(&spawn->list, secondary_spawns); if (list_is_last(&n->list, stack)) return top; n = list_next_entry(n, list); if (!spawn->dead) n->dead = false; return &n->inst->alg.cra_users; } static void crypto_remove_instance(struct crypto_instance *inst, struct list_head *list) { struct crypto_template *tmpl = inst->tmpl; if (crypto_is_dead(&inst->alg)) return; inst->alg.cra_flags |= CRYPTO_ALG_DEAD; if (!tmpl) return; list_del_init(&inst->alg.cra_list); hlist_del(&inst->list); hlist_add_head(&inst->list, &tmpl->dead); BUG_ON(!list_empty(&inst->alg.cra_users)); crypto_alg_put(&inst->alg); } /* * Given an algorithm alg, remove all algorithms that depend on it * through spawns. If nalg is not null, then exempt any algorithms * that is depended on by nalg. This is useful when nalg itself * depends on alg. */ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg) { u32 new_type = (nalg ?: alg)->cra_flags; struct crypto_spawn *spawn, *n; LIST_HEAD(secondary_spawns); struct list_head *spawns; LIST_HEAD(stack); LIST_HEAD(top); spawns = &alg->cra_users; list_for_each_entry_safe(spawn, n, spawns, list) { if ((spawn->alg->cra_flags ^ new_type) & spawn->mask) continue; list_move(&spawn->list, &top); } /* * Perform a depth-first walk starting from alg through * the cra_users tree. The list stack records the path * from alg to the current spawn. */ spawns = &top; do { while (!list_empty(spawns)) { struct crypto_instance *inst; spawn = list_first_entry(spawns, struct crypto_spawn, list); inst = spawn->inst; list_move(&spawn->list, &stack); spawn->dead = !spawn->registered || &inst->alg != nalg; if (!spawn->registered) break; BUG_ON(&inst->alg == alg); if (&inst->alg == nalg) break; spawns = &inst->alg.cra_users; /* * Even if spawn->registered is true, the * instance itself may still be unregistered. * This is because it may have failed during * registration. Therefore we still need to * make the following test. * * We may encounter an unregistered instance here, since * an instance's spawns are set up prior to the instance * being registered. An unregistered instance will have * NULL ->cra_users.next, since ->cra_users isn't * properly initialized until registration. But an * unregistered instance cannot have any users, so treat * it the same as ->cra_users being empty. */ if (spawns->next == NULL) break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); /* * Remove all instances that are marked as dead. Also * complete the resurrection of the others by moving them * back to the cra_users list. */ list_for_each_entry_safe(spawn, n, &secondary_spawns, list) { if (!spawn->dead) list_move(&spawn->list, &spawn->alg->cra_users); else if (spawn->registered) crypto_remove_instance(spawn->inst, list); } } EXPORT_SYMBOL_GPL(crypto_remove_spawns); static void crypto_alg_finish_registration(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) continue; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) continue; if (strcmp(alg->cra_name, q->cra_name)) continue; if (strcmp(alg->cra_driver_name, q->cra_driver_name) && q->cra_priority > alg->cra_priority) continue; crypto_remove_spawns(q, algs_to_put, alg); } crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); } static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) { struct crypto_larval *larval; if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) || (alg->cra_flags & CRYPTO_ALG_INTERNAL)) return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); if (IS_ERR(larval)) return larval; larval->adult = crypto_mod_get(alg); if (!larval->adult) { kfree(larval); return ERR_PTR(-ENOENT); } refcount_set(&larval->alg.cra_refcnt, 1); memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, CRYPTO_MAX_ALG_NAME); larval->alg.cra_priority = alg->cra_priority; return larval; } static struct crypto_larval * __crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; int ret = -EAGAIN; if (crypto_is_dead(alg)) goto err; INIT_LIST_HEAD(&alg->cra_users); ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) goto err; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { if (!strcmp(alg->cra_driver_name, q->cra_driver_name)) goto err; continue; } if (!strcmp(q->cra_driver_name, alg->cra_name) || !strcmp(q->cra_driver_name, alg->cra_driver_name) || !strcmp(q->cra_name, alg->cra_driver_name)) goto err; } larval = crypto_alloc_test_larval(alg); if (IS_ERR(larval)) goto out; list_add(&alg->cra_list, &crypto_alg_list); if (larval) { /* No cheating! */ alg->cra_flags &= ~CRYPTO_ALG_TESTED; list_add(&larval->alg.cra_list, &crypto_alg_list); } else { alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_alg_finish_registration(alg, algs_to_put); } out: return larval; err: larval = ERR_PTR(ret); goto out; } void crypto_alg_tested(const char *name, int err) { struct crypto_larval *test; struct crypto_alg *alg; struct crypto_alg *q; LIST_HEAD(list); down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) continue; test = (struct crypto_larval *)q; if (!strcmp(q->cra_driver_name, name)) goto found; } pr_err("alg: Unexpected test result for %s: %d\n", name, err); up_write(&crypto_alg_sem); return; found: q->cra_flags |= CRYPTO_ALG_DEAD; alg = test->adult; if (crypto_is_dead(alg)) goto complete; if (err == -ECANCELED) alg->cra_flags |= CRYPTO_ALG_FIPS_INTERNAL; else if (err) goto complete; else alg->cra_flags &= ~CRYPTO_ALG_FIPS_INTERNAL; alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_alg_finish_registration(alg, &list); complete: list_del_init(&test->alg.cra_list); complete_all(&test->completion); up_write(&crypto_alg_sem); crypto_alg_put(&test->alg); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_alg_tested); void crypto_remove_final(struct list_head *list) { struct crypto_alg *alg; struct crypto_alg *n; list_for_each_entry_safe(alg, n, list, cra_list) { list_del_init(&alg->cra_list); crypto_alg_put(alg); } } EXPORT_SYMBOL_GPL(crypto_remove_final); static void crypto_free_alg(struct crypto_alg *alg) { unsigned int algsize = alg->cra_type->algsize; u8 *p = (u8 *)alg - algsize; crypto_destroy_alg(alg); kfree(p); } int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; bool test_started = false; LIST_HEAD(algs_to_put); int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; err = crypto_check_alg(alg); if (err) return err; if (alg->cra_flags & CRYPTO_ALG_DUP_FIRST && !WARN_ON_ONCE(alg->cra_destroy)) { unsigned int algsize = alg->cra_type->algsize; u8 *p = (u8 *)alg - algsize; p = kmemdup(p, algsize + sizeof(*alg), GFP_KERNEL); if (!p) return -ENOMEM; alg = (void *)(p + algsize); alg->cra_destroy = crypto_free_alg; } down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg, &algs_to_put); if (!IS_ERR_OR_NULL(larval)) { test_started = crypto_boot_test_finished(); larval->test_started = test_started; } up_write(&crypto_alg_sem); if (IS_ERR(larval)) { crypto_alg_put(alg); return PTR_ERR(larval); } if (test_started) crypto_schedule_test(larval); else crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list) { if (unlikely(list_empty(&alg->cra_list))) return -ENOENT; alg->cra_flags |= CRYPTO_ALG_DEAD; list_del_init(&alg->cra_list); crypto_remove_spawns(alg, list, NULL); return 0; } void crypto_unregister_alg(struct crypto_alg *alg) { int ret; LIST_HEAD(list); down_write(&crypto_alg_sem); ret = crypto_remove_alg(alg, &list); up_write(&crypto_alg_sem); if (WARN(ret, "Algorithm %s is not registered", alg->cra_driver_name)) return; WARN_ON(!alg->cra_destroy && refcount_read(&alg->cra_refcnt) != 1); list_add(&alg->cra_list, &list); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_alg); int crypto_register_algs(struct crypto_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_alg(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_alg(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_algs); void crypto_unregister_algs(struct crypto_alg *algs, int count) { int i; for (i = 0; i < count; i++) crypto_unregister_alg(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_algs); int crypto_register_template(struct crypto_template *tmpl) { struct crypto_template *q; int err = -EEXIST; INIT_WORK(&tmpl->free_work, crypto_destroy_instance_workfn); down_write(&crypto_alg_sem); crypto_check_module_sig(tmpl->module); list_for_each_entry(q, &crypto_template_list, list) { if (q == tmpl) goto out; } list_add(&tmpl->list, &crypto_template_list); err = 0; out: up_write(&crypto_alg_sem); return err; } EXPORT_SYMBOL_GPL(crypto_register_template); int crypto_register_templates(struct crypto_template *tmpls, int count) { int i, err; for (i = 0; i < count; i++) { err = crypto_register_template(&tmpls[i]); if (err) goto out; } return 0; out: for (--i; i >= 0; --i) crypto_unregister_template(&tmpls[i]); return err; } EXPORT_SYMBOL_GPL(crypto_register_templates); void crypto_unregister_template(struct crypto_template *tmpl) { struct crypto_instance *inst; struct hlist_node *n; struct hlist_head *list; LIST_HEAD(users); down_write(&crypto_alg_sem); BUG_ON(list_empty(&tmpl->list)); list_del_init(&tmpl->list); list = &tmpl->instances; hlist_for_each_entry(inst, list, list) { int err = crypto_remove_alg(&inst->alg, &users); BUG_ON(err); } up_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, list, list) { BUG_ON(refcount_read(&inst->alg.cra_refcnt) != 1); crypto_free_instance(inst); } crypto_remove_final(&users); flush_work(&tmpl->free_work); } EXPORT_SYMBOL_GPL(crypto_unregister_template); void crypto_unregister_templates(struct crypto_template *tmpls, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_template(&tmpls[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_templates); static struct crypto_template *__crypto_lookup_template(const char *name) { struct crypto_template *q, *tmpl = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_template_list, list) { if (strcmp(q->name, name)) continue; if (unlikely(!crypto_tmpl_get(q))) continue; tmpl = q; break; } up_read(&crypto_alg_sem); return tmpl; } struct crypto_template *crypto_lookup_template(const char *name) { return try_then_request_module(__crypto_lookup_template(name), "crypto-%s", name); } EXPORT_SYMBOL_GPL(crypto_lookup_template); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst) { struct crypto_larval *larval; struct crypto_spawn *spawn; u32 fips_internal = 0; LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); if (err) return err; inst->alg.cra_module = tmpl->module; inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE; inst->alg.cra_destroy = crypto_destroy_instance; down_write(&crypto_alg_sem); larval = ERR_PTR(-EAGAIN); for (spawn = inst->spawns; spawn;) { struct crypto_spawn *next; if (spawn->dead) goto unlock; next = spawn->next; spawn->inst = inst; spawn->registered = true; fips_internal |= spawn->alg->cra_flags; crypto_mod_put(spawn->alg); spawn = next; } inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL); larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; else if (larval) larval->test_started = true; hlist_add_head(&inst->list, &tmpl->instances); inst->tmpl = tmpl; unlock: up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); if (larval) crypto_schedule_test(larval); else crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); void crypto_unregister_instance(struct crypto_instance *inst) { LIST_HEAD(list); down_write(&crypto_alg_sem); crypto_remove_spawns(&inst->alg, &list, NULL); crypto_remove_instance(inst, &list); up_write(&crypto_alg_sem); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_instance); int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { struct crypto_alg *alg; int err = -EAGAIN; if (WARN_ON_ONCE(inst == NULL)) return -EINVAL; /* Allow the result of crypto_attr_alg_name() to be passed directly */ if (IS_ERR(name)) return PTR_ERR(name); alg = crypto_find_alg(name, spawn->frontend, type | CRYPTO_ALG_FIPS_INTERNAL, mask); if (IS_ERR(alg)) return PTR_ERR(alg); down_write(&crypto_alg_sem); if (!crypto_is_moribund(alg)) { list_add(&spawn->list, &alg->cra_users); spawn->alg = alg; spawn->mask = mask; spawn->next = inst->spawns; inst->spawns = spawn; inst->alg.cra_flags |= (alg->cra_flags & CRYPTO_ALG_INHERITED_FLAGS); err = 0; } up_write(&crypto_alg_sem); if (err) crypto_mod_put(alg); return err; } EXPORT_SYMBOL_GPL(crypto_grab_spawn); void crypto_drop_spawn(struct crypto_spawn *spawn) { if (!spawn->alg) /* not yet initialized? */ return; down_write(&crypto_alg_sem); if (!spawn->dead) list_del(&spawn->list); up_write(&crypto_alg_sem); if (!spawn->registered) crypto_mod_put(spawn->alg); } EXPORT_SYMBOL_GPL(crypto_drop_spawn); static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn) { struct crypto_alg *alg = ERR_PTR(-EAGAIN); struct crypto_alg *target; bool shoot = false; down_read(&crypto_alg_sem); if (!spawn->dead) { alg = spawn->alg; if (!crypto_mod_get(alg)) { target = crypto_alg_get(alg); shoot = true; alg = ERR_PTR(-EAGAIN); } } up_read(&crypto_alg_sem); if (shoot) { crypto_shoot_alg(target); crypto_alg_put(target); } return alg; } struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = ERR_PTR(-EINVAL); if (unlikely((alg->cra_flags ^ type) & mask)) goto out_put_alg; tfm = __crypto_alloc_tfm(alg, type, mask); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm); void *crypto_spawn_tfm2(struct crypto_spawn *spawn) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = crypto_create_tfm(alg, spawn->frontend); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb) { struct rtattr *rta = tb[0]; struct crypto_attr_type *algt; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*algt)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_TYPE) return ERR_PTR(-EINVAL); algt = RTA_DATA(rta); return algt; } EXPORT_SYMBOL_GPL(crypto_get_attr_type); /** * crypto_check_attr_type() - check algorithm type and compute inherited mask * @tb: the template parameters * @type: the algorithm type the template would be instantiated as * @mask_ret: (output) the mask that should be passed to crypto_grab_*() * to restrict the flags of any inner algorithms * * Validate that the algorithm type the user requested is compatible with the * one the template would actually be instantiated as. E.g., if the user is * doing crypto_alloc_shash("cbc(aes)", ...), this would return an error because * the "cbc" template creates an "skcipher" algorithm, not an "shash" algorithm. * * Also compute the mask to use to restrict the flags of any inner algorithms. * * Return: 0 on success; -errno on failure */ int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); if ((algt->type ^ type) & algt->mask) return -EINVAL; *mask_ret = crypto_algt_inherited_mask(algt); return 0; } EXPORT_SYMBOL_GPL(crypto_check_attr_type); const char *crypto_attr_alg_name(struct rtattr *rta) { struct crypto_attr_alg *alga; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*alga)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_ALG) return ERR_PTR(-EINVAL); alga = RTA_DATA(rta); alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0; return alga->name; } EXPORT_SYMBOL_GPL(crypto_attr_alg_name); int __crypto_inst_setname(struct crypto_instance *inst, const char *name, const char *driver, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", driver, alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return 0; } EXPORT_SYMBOL_GPL(__crypto_inst_setname); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen) { INIT_LIST_HEAD(&queue->list); queue->backlog = &queue->list; queue->qlen = 0; queue->max_qlen = max_qlen; } EXPORT_SYMBOL_GPL(crypto_init_queue); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request) { int err = -EINPROGRESS; if (unlikely(queue->qlen >= queue->max_qlen)) { if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { err = -ENOSPC; goto out; } err = -EBUSY; if (queue->backlog == &queue->list) queue->backlog = &request->list; } queue->qlen++; list_add_tail(&request->list, &queue->list); out: return err; } EXPORT_SYMBOL_GPL(crypto_enqueue_request); void crypto_enqueue_request_head(struct crypto_queue *queue, struct crypto_async_request *request) { if (unlikely(queue->qlen >= queue->max_qlen)) queue->backlog = queue->backlog->prev; queue->qlen++; list_add(&request->list, &queue->list); } EXPORT_SYMBOL_GPL(crypto_enqueue_request_head); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue) { struct list_head *request; if (unlikely(!queue->qlen)) return NULL; queue->qlen--; if (queue->backlog != &queue->list) queue->backlog = queue->backlog->next; request = queue->list.next; list_del_init(request); return list_entry(request, struct crypto_async_request, list); } EXPORT_SYMBOL_GPL(crypto_dequeue_request); static inline void crypto_inc_byte(u8 *a, unsigned int size) { u8 *b = (a + size); u8 c; for (; size; size--) { c = *--b + 1; *b = c; if (c) break; } } void crypto_inc(u8 *a, unsigned int size) { __be32 *b = (__be32 *)(a + size); u32 c; if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || IS_ALIGNED((unsigned long)b, __alignof__(*b))) for (; size >= 4; size -= 4) { c = be32_to_cpu(*--b) + 1; *b = cpu_to_be32(c); if (likely(c)) return; } crypto_inc_byte(a, size); } EXPORT_SYMBOL_GPL(crypto_inc); unsigned int crypto_alg_extsize(struct crypto_alg *alg) { return alg->cra_ctxsize + (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1)); } EXPORT_SYMBOL_GPL(crypto_alg_extsize); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_type_has_alg); static void __init crypto_start_tests(void) { if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) return; if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)) return; set_crypto_boot_test_finished(); for (;;) { struct crypto_larval *larval = NULL; struct crypto_alg *q; down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { struct crypto_larval *l; if (!crypto_is_larval(q)) continue; l = (void *)q; if (!crypto_is_test_larval(l)) continue; if (l->test_started) continue; l->test_started = true; larval = l; break; } up_write(&crypto_alg_sem); if (!larval) break; crypto_schedule_test(larval); } } static int __init crypto_algapi_init(void) { crypto_init_proc(); crypto_start_tests(); return 0; } static void __exit crypto_algapi_exit(void) { crypto_exit_proc(); } /* * We run this at late_initcall so that all the built-in algorithms * have had a chance to register themselves first. */ late_initcall(crypto_algapi_init); module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cryptographic algorithms API"); MODULE_SOFTDEP("pre: cryptomgr");
4 6 6 1 11 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/iova_bitmap.h> #include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/uaccess.h> #include <linux/xarray.h> #include <uapi/linux/iommufd.h> #include "../iommu-priv.h" struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; struct dma_buf_attachment; struct dma_buf_phys_vec; struct iommufd_sw_msi_map { struct list_head sw_msi_item; phys_addr_t sw_msi_start; phys_addr_t msi_addr; unsigned int pgoff; unsigned int id; }; /* Bitmap of struct iommufd_sw_msi_map::id */ struct iommufd_sw_msi_maps { DECLARE_BITMAP(bitmap, 64); }; #ifdef CONFIG_IRQ_MSI_IOMMU int iommufd_sw_msi_install(struct iommufd_ctx *ictx, struct iommufd_hwpt_paging *hwpt_paging, struct iommufd_sw_msi_map *msi_map); #endif struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; struct maple_tree mt_mmap; struct mutex sw_msi_lock; struct list_head sw_msi_list; unsigned int sw_msi_id; u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; struct iommufd_ioas *vfio_ioas; }; /* Entry for iommufd_ctx::mt_mmap */ struct iommufd_mmap { struct iommufd_object *owner; /* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */ unsigned long vm_pgoff; /* Physical range for io_remap_pfn_range() */ phys_addr_t mmio_addr; size_t length; }; /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. This * supports both a design where IOAS's are 1:1 with a domain (eg because the * domain is HW customized), or where the IOAS is 1:N with multiple generic * domains. The io_pagetable holds an interval tree of iopt_areas which point * to shared iopt_pages which hold the pfns mapped to the page table. * * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex */ struct io_pagetable { struct rw_semaphore domains_rwsem; struct xarray domains; struct xarray access_list; unsigned int next_domain_id; struct rw_semaphore iova_rwsem; struct rb_root_cached area_itree; /* IOVA that cannot become reserved, struct iopt_allowed */ struct rb_root_cached allowed_itree; /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; unsigned long iova_alignment; }; void iopt_init_table(struct io_pagetable *iopt); void iopt_destroy_table(struct io_pagetable *iopt); int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list); void iopt_free_pages_list(struct list_head *pages_list); enum { IOPT_ALLOC_IOVA = 1 << 0, }; int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, int fd, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap); int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable); void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain); int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, struct device *dev, phys_addr_t *sw_msi_start); int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova); int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, unsigned long last, void *owner); void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, size_t num_iovas); void iopt_enable_large_pages(struct io_pagetable *iopt); int iopt_disable_large_pages(struct io_pagetable *iopt); struct iommufd_ucmd { struct iommufd_ctx *ictx; void __user *ubuffer; u32 user_size; void *cmd; struct iommufd_object *new_obj; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, unsigned long arg); /* Copy the response in ucmd->cmd back to userspace. */ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, size_t cmd_len) { if (copy_to_user(ucmd->ubuffer, ucmd->cmd, min_t(size_t, ucmd->user_size, cmd_len))) return -EFAULT; return 0; } static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) return false; if (!refcount_inc_not_zero(&obj->wait_cnt)) { /* * If the caller doesn't already have a ref on obj this must be * called under the xa_lock. Otherwise the caller is holding a * ref on users. Thus it cannot be one before this decrement. */ refcount_dec(&obj->users); return false; } return true; } struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); static inline void iommufd_put_object(struct iommufd_ctx *ictx, struct iommufd_object *obj) { /* * Users first, then wait_cnt so that REMOVE_WAIT never sees a spurious * !0 users with a 0 wait_cnt. */ refcount_dec(&obj->users); if (refcount_dec_and_test(&obj->wait_cnt)) wake_up_interruptible_all(&ictx->destroy_wait); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); enum { REMOVE_WAIT = BIT(0), REMOVE_OBJ_TOMBSTONE = BIT(1), }; int iommufd_object_remove(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy, u32 id, unsigned int flags); /* * The caller holds a users refcount and wants to destroy the object. At this * point the caller has no wait_cnt reference and at least the xarray will be * holding one. */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { int ret; ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT); /* * If there is a bug and we couldn't destroy the object then we did put * back the caller's users refcount and will eventually try to free it * again during close. */ WARN_ON(ret); } /* * Similar to iommufd_object_destroy_user(), except that the object ID is left * reserved/tombstoned. */ static inline void iommufd_object_tombstone_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { int ret; ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT | REMOVE_OBJ_TOMBSTONE); /* * If there is a bug and we couldn't destroy the object then we did put * back the caller's users refcount and will eventually try to free it * again during close. */ WARN_ON(ret); } /* * The HWPT allocated by autodomains is used in possibly many devices and * is automatically destroyed when its refcount reaches zero. * * If userspace uses the HWPT manually, even for a short term, then it will * disrupt this refcounting and the auto-free in the kernel will not work. * Userspace that tries to use the automatically allocated HWPT must be careful * to ensure that it is consistently destroyed, eg by not racing accesses * and by not attaching an automatic HWPT to a device manually. */ static inline void iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { iommufd_object_remove(ictx, obj, obj->id, 0); } /* * Callers of these normal object allocators must call iommufd_object_finalize() * to finalize the object, or call iommufd_object_abort_and_destroy() to revert * the allocation. */ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ offsetof(typeof(*(ptr)), \ obj) != 0), \ type), \ typeof(*(ptr)), obj) #define iommufd_object_alloc(ictx, ptr, type) \ __iommufd_object_alloc(ictx, ptr, type, obj) /* * Callers of these _ucmd allocators should not call iommufd_object_finalize() * or iommufd_object_abort_and_destroy(), as the core automatically does that. */ struct iommufd_object * _iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, size_t size, enum iommufd_object_type type); #define __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) \ container_of(_iommufd_object_alloc_ucmd( \ ucmd, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ offsetof(typeof(*(ptr)), \ obj) != 0), \ type), \ typeof(*(ptr)), obj) #define iommufd_object_alloc_ucmd(ucmd, ptr, type) \ __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The * mapping is copied into all of the associated domains and made available to * in-kernel users. * * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable * object. When we go to attach a device to an IOAS we need to get an * iommu_domain and wrapping iommufd_hw_pagetable for it. * * An iommu_domain & iommfd_hw_pagetable will be automatically selected * for a device based on the hwpt_list. If no suitable iommu_domain * is found a new iommu_domain will be created. */ struct iommufd_ioas { struct iommufd_object obj; struct io_pagetable iopt; struct mutex mutex; struct list_head hwpt_list; }; static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object * allows directly creating and inspecting the domains. Domains that have kernel * owned page tables will be associated with an iommufd_ioas that provides the * IOVA to PFN map. */ struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; struct iommufd_fault *fault; bool pasid_compat : 1; }; struct iommufd_hwpt_paging { struct iommufd_hw_pagetable common; struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) { return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; } static inline struct iommufd_hwpt_paging * to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) { return container_of(hwpt, struct iommufd_hwpt_paging, common); } static inline struct iommufd_hwpt_nested * to_hwpt_nested(struct iommufd_hw_pagetable *hwpt) { return container_of(hwpt, struct iommufd_hwpt_nested, common); } static inline struct iommufd_hwpt_paging * find_hwpt_paging(struct iommufd_hw_pagetable *hwpt) { switch (hwpt->obj.type) { case IOMMUFD_OBJ_HWPT_PAGING: return to_hwpt_paging(hwpt); case IOMMUFD_OBJ_HWPT_NESTED: return to_hwpt_nested(hwpt)->parent; default: return NULL; } } static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hwpt_paging, common.obj); } static inline struct iommufd_hw_pagetable * iommufd_get_hwpt_nested(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_NESTED), struct iommufd_hw_pagetable, obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, ioasid_t pasid, u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev, ioasid_t pasid); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); if (hwpt_paging->auto_domain) { lockdep_assert_not_held(&hwpt_paging->ioas->mutex); iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } } refcount_dec(&hwpt->obj.users); } struct iommufd_attach; struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; struct xarray pasid_attach; struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; /* * A iommufd_device object represents the binding relationship between a * consuming driver and the iommufd. These objects are created/destroyed by * external drivers, not by userspace. */ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; struct list_head group_item; /* always the physical device */ struct device *dev; bool enforce_cache_coherency; struct iommufd_vdevice *vdev; bool destroying; }; static inline struct iommufd_device * iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_DEVICE), struct iommufd_device, obj); } void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); struct device *iommufd_global_device(void); struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_ioas *ioas; struct iommufd_ioas *ioas_unpin; struct mutex ioas_lock; const struct iommufd_access_ops *ops; void *data; unsigned long iova_alignment; u32 iopt_access_list_id; }; int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); /* iommufd_access for internal use */ static inline bool iommufd_access_is_internal(struct iommufd_access *access) { return !access->ictx; } struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx); static inline void iommufd_access_destroy_internal(struct iommufd_ctx *ictx, struct iommufd_access *access) { iommufd_object_destroy_user(ictx, &access->obj); } int iommufd_access_attach_internal(struct iommufd_access *access, struct iommufd_ioas *ioas); static inline void iommufd_access_detach_internal(struct iommufd_access *access) { iommufd_access_detach(access); } struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; spinlock_t lock; /* protects the deliver list */ struct list_head deliver; struct wait_queue_head wait_queue; }; struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; }; /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) /* * An iommufd_fault object represents an interface to deliver I/O page faults * to the user space. These objects are created/destroyed by the user space and * associated with hardware page table objects during page-table allocation. */ struct iommufd_fault { struct iommufd_eventq common; struct mutex mutex; /* serializes response flows */ struct xarray response; }; static inline struct iommufd_fault * eventq_to_fault(struct iommufd_eventq *eventq) { return container_of(eventq, struct iommufd_fault, common); } static inline struct iommufd_fault * iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_FAULT), struct iommufd_fault, common.obj); } int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle); /* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ struct iommufd_vevent { struct iommufd_vevent_header header; struct list_head node; /* for iommufd_eventq::deliver */ ssize_t data_len; u64 event_data[] __counted_by(data_len); }; #define vevent_for_lost_events_header(vevent) \ (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) /* * An iommufd_veventq object represents an interface to deliver vIOMMU events to * the user space. It is created/destroyed by the user space and associated with * a vIOMMU object during the allocations. */ struct iommufd_veventq { struct iommufd_eventq common; struct iommufd_viommu *viommu; struct list_head node; /* for iommufd_viommu::veventqs */ enum iommu_veventq_type type; unsigned int depth; /* Use common.lock for protection */ u32 num_events; u32 sequence; /* Must be last as it ends in a flexible-array member. */ struct iommufd_vevent lost_events_header; }; static inline struct iommufd_veventq * eventq_to_veventq(struct iommufd_eventq *eventq) { return container_of(eventq, struct iommufd_veventq, common); } static inline struct iommufd_veventq * iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_VEVENTQ), struct iommufd_veventq, common.obj); } int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); void iommufd_veventq_destroy(struct iommufd_object *obj); void iommufd_veventq_abort(struct iommufd_object *obj); static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, struct iommufd_vevent *vevent) { struct iommufd_eventq *eventq = &veventq->common; lockdep_assert_held(&eventq->lock); /* * Remove the lost_events_header and add the new node at the same time. * Note the new node can be lost_events_header, for a sequence update. */ if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) list_del(&veventq->lost_events_header.node); list_add_tail(&vevent->node, &eventq->deliver); vevent->header.sequence = veventq->sequence; veventq->sequence = (veventq->sequence + 1) & INT_MAX; wake_up_interruptible(&eventq->wait_queue); } static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_VIOMMU), struct iommufd_viommu, obj); } static inline struct iommufd_veventq * iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, enum iommu_veventq_type type) { struct iommufd_veventq *veventq, *next; lockdep_assert_held(&viommu->veventqs_rwsem); list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { if (veventq->type == type) return veventq; } return NULL; } int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); void iommufd_vdevice_abort(struct iommufd_object *obj); int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_hw_queue_destroy(struct iommufd_object *obj); static inline struct iommufd_vdevice * iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) { return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_VDEVICE), struct iommufd_vdevice, obj); } #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); extern size_t iommufd_test_memory_limit; void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags); bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags) { } static inline bool iommufd_should_fail(void) { return false; } static inline int __init iommufd_test_init(void) { return 0; } static inline void iommufd_test_exit(void) { } static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } static inline int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, struct dma_buf_phys_vec *phys) { return -EOPNOTSUPP; } #endif #endif
8 21 7 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_EXPORTFS_H #define LINUX_EXPORTFS_H 1 #include <linux/types.h> #include <linux/path.h> struct dentry; struct iattr; struct inode; struct iomap; struct super_block; struct vfsmount; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 /* * The fileid_type identifies how the file within the filesystem is encoded. * In theory this is freely set and parsed by the filesystem, but we try to * stick to conventions so we can share some generic code and don't confuse * sniffers like ethereal/wireshark. * * The filesystem must not use the value '0' or '0xff'. */ enum fid_type { /* * The root, or export point, of the filesystem. * (Never actually passed down to the filesystem. */ FILEID_ROOT = 0, /* * 32bit inode number, 32 bit generation number. */ FILEID_INO32_GEN = 1, /* * 32bit inode number, 32 bit generation number, * 32 bit parent directory inode number. */ FILEID_INO32_GEN_PARENT = 2, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number. */ FILEID_BTRFS_WITHOUT_PARENT = 0x4d, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation. */ FILEID_BTRFS_WITH_PARENT = 0x4e, /* * 64 bit object ID, 64 bit root object ID, * 32 bit generation number, * 64 bit parent object ID, 32 bit parent generation, * 64 bit parent root object ID. */ FILEID_BTRFS_WITH_PARENT_ROOT = 0x4f, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number. */ FILEID_UDF_WITHOUT_PARENT = 0x51, /* * 32 bit block number, 16 bit partition reference, * 16 bit unused, 32 bit generation number, * 32 bit parent block number, 32 bit parent generation number */ FILEID_UDF_WITH_PARENT = 0x52, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number. */ FILEID_NILFS_WITHOUT_PARENT = 0x61, /* * 64 bit checkpoint number, 64 bit inode number, * 32 bit generation number, 32 bit parent generation. * 64 bit parent inode number. */ FILEID_NILFS_WITH_PARENT = 0x62, /* * 32 bit generation number, 40 bit i_pos. */ FILEID_FAT_WITHOUT_PARENT = 0x71, /* * 32 bit generation number, 40 bit i_pos, * 32 bit parent generation number, 40 bit parent i_pos */ FILEID_FAT_WITH_PARENT = 0x72, /* * 64 bit inode number, 32 bit generation number. */ FILEID_INO64_GEN = 0x81, /* * 64 bit inode number, 32 bit generation number, * 64 bit parent inode number, 32 bit parent generation. */ FILEID_INO64_GEN_PARENT = 0x82, /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) */ FILEID_LUSTRE = 0x97, /* * 64 bit inode number, 32 bit subvolume, 32 bit generation number: */ FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, /* * * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. */ FILEID_NSFS = 0xf1, /* * 64 bit unique kernfs id */ FILEID_KERNFS = 0xfe, /* * Filesystems must not use 0xff file ID. */ FILEID_INVALID = 0xff, }; struct fid { union { struct { u32 ino; u32 gen; u32 parent_ino; u32 parent_gen; } i32; struct { u64 ino; u32 gen; } __packed i64; struct { u32 block; u16 partref; u16 parent_partref; u32 generation; u32 parent_block; u32 parent_generation; } udf; DECLARE_FLEX_ARRAY(__u32, raw); }; }; enum handle_to_path_flags { HANDLE_CHECK_PERMS = (1 << 0), HANDLE_CHECK_SUBTREE = (1 << 1), }; struct handle_to_path_ctx { struct path root; enum handle_to_path_flags flags; unsigned int fh_flags; }; #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /* * Filesystems use only lower 8 bits of file_handle type for fid_type. * name_to_handle_at() uses upper 16 bits of type as user flags to be * interpreted by open_by_handle_at(). */ #define FILEID_USER_FLAGS_MASK 0xffff0000 #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK) /* Flags supported in encoded handle_type that is exported to user */ #define FILEID_IS_CONNECTABLE 0x10000 #define FILEID_IS_DIR 0x20000 #define FILEID_VALID_USER_FLAGS (FILEID_IS_CONNECTABLE | FILEID_IS_DIR) /** * struct export_operations - for nfsd to communicate with file systems * @encode_fh: encode a file handle fragment from a dentry * @fh_to_dentry: find the implied object and get a dentry for it * @fh_to_parent: find the implied object's parent and get a dentry for it * @get_name: find the name for a given inode in a given directory * @get_parent: find the parent of a given directory * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * * encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit * set, the encode_fh() should store sufficient information so that a good * attempt can be made to find not only the file but also it's place in the * filesystem. This typically means storing a reference to de->d_parent in * the filehandle fragment. encode_fh() should return the fileid_type on * success and on error returns 255 (if the space needed to encode fh is * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * * fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, * it should return a %NULL pointer if the file cannot be found, or an * %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred. * Any other error code is treated like %NULL, and will cause an %ESTALE error * for callers of exportfs_decode_fh(). * Any suitable dentry can be returned including, if necessary, a new dentry * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * * fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * * get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * * get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * * permission: * Allow filesystems to specify a custom permission function. * * open: * Allow filesystems to specify a custom open function. * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) */ struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); int (*get_name)(struct dentry *parent, char *name, struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int (*map_blocks)(struct inode *inode, loff_t offset, u64 len, struct iomap *iomap, bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ #define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ #define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */ #define EXPORT_OP_NOLOCKS (0x40) /* no file locking support */ unsigned long flags; }; /** * exportfs_cannot_lock() - check if export implements file locking * @export_ops: the nfs export operations to check * * Returns true if the export does not support file locking. */ static inline bool exportfs_cannot_lock(const struct export_operations *export_ops) { return export_ops->flags & EXPORT_OP_NOLOCKS; } extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int flags); static inline bool exportfs_can_encode_fid(const struct export_operations *nop) { return !nop || nop->encode_fh; } static inline bool exportfs_can_decode_fh(const struct export_operations *nop) { return nop && nop->fh_to_dentry; } static inline bool exportfs_can_encode_fh(const struct export_operations *nop, int fh_flags) { /* * If a non-decodeable file handle was requested, we only need to make * sure that filesystem did not opt-out of encoding fid. */ if (fh_flags & EXPORT_FH_FID) return exportfs_can_encode_fid(nop); /* Normal file handles cannot be created without export ops */ if (!nop) return false; /* * If a connectable file handle was requested, we need to make sure that * filesystem can also decode connected file handles. */ if ((fh_flags & EXPORT_FH_CONNECTABLE) && !nop->fh_to_parent) return false; /* * If a decodeable file handle was requested, we need to make sure that * filesystem can also decode file handles. */ return exportfs_can_decode_fh(nop); } static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, int *max_len) { return exportfs_encode_inode_fh(inode, fid, max_len, NULL, EXPORT_FH_FID); } extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context); /* * Generic helpers for filesystems. */ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); #endif /* LINUX_EXPORTFS_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 // SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for PEAK System PCAN-USB adapter * Derived from the PCAN project file driver/src/pcan_usb.c * * Copyright (C) 2003-2025 PEAK System-Technik GmbH * Author: Stéphane Grosjean <stephane.grosjean@hms-networks.com> * * Many thanks to Klaus Hitschler <klaus.hitschler@gmx.de> */ #include <linux/unaligned.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/usb.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include "pcan_usb_core.h" /* PCAN-USB Endpoints */ #define PCAN_USB_EP_CMDOUT 1 #define PCAN_USB_EP_CMDIN (PCAN_USB_EP_CMDOUT | USB_DIR_IN) #define PCAN_USB_EP_MSGOUT 2 #define PCAN_USB_EP_MSGIN (PCAN_USB_EP_MSGOUT | USB_DIR_IN) /* PCAN-USB command struct */ #define PCAN_USB_CMD_FUNC 0 #define PCAN_USB_CMD_NUM 1 #define PCAN_USB_CMD_ARGS 2 #define PCAN_USB_CMD_ARGS_LEN 14 #define PCAN_USB_CMD_LEN (PCAN_USB_CMD_ARGS + \ PCAN_USB_CMD_ARGS_LEN) /* PCAN-USB commands */ #define PCAN_USB_CMD_BITRATE 1 #define PCAN_USB_CMD_SET_BUS 3 #define PCAN_USB_CMD_DEVID 4 #define PCAN_USB_CMD_SN 6 #define PCAN_USB_CMD_REGISTER 9 #define PCAN_USB_CMD_EXT_VCC 10 #define PCAN_USB_CMD_ERR_FR 11 #define PCAN_USB_CMD_LED 12 /* PCAN_USB_CMD_SET_BUS number arg */ #define PCAN_USB_BUS_XCVER 2 #define PCAN_USB_BUS_SILENT_MODE 3 /* PCAN_USB_CMD_xxx functions */ #define PCAN_USB_GET 1 #define PCAN_USB_SET 2 /* PCAN-USB command timeout (ms.) */ #define PCAN_USB_COMMAND_TIMEOUT 1000 /* PCAN-USB startup timeout (ms.) */ #define PCAN_USB_STARTUP_TIMEOUT 10 /* PCAN-USB rx/tx buffers size */ #define PCAN_USB_RX_BUFFER_SIZE 64 #define PCAN_USB_TX_BUFFER_SIZE 64 #define PCAN_USB_MSG_HEADER_LEN 2 #define PCAN_USB_MSG_TX_CAN 2 /* Tx msg is a CAN frame */ /* PCAN-USB adapter internal clock (MHz) */ #define PCAN_USB_CRYSTAL_HZ 16000000 /* PCAN-USB USB message record status/len field */ #define PCAN_USB_STATUSLEN_TIMESTAMP (1 << 7) #define PCAN_USB_STATUSLEN_INTERNAL (1 << 6) #define PCAN_USB_STATUSLEN_EXT_ID (1 << 5) #define PCAN_USB_STATUSLEN_RTR (1 << 4) #define PCAN_USB_STATUSLEN_DLC (0xf) /* PCAN-USB 4.1 CAN Id tx extended flags */ #define PCAN_USB_TX_SRR 0x01 /* SJA1000 SRR command */ #define PCAN_USB_TX_AT 0x02 /* SJA1000 AT command */ /* PCAN-USB error flags */ #define PCAN_USB_ERROR_TXFULL 0x01 #define PCAN_USB_ERROR_RXQOVR 0x02 #define PCAN_USB_ERROR_BUS_LIGHT 0x04 #define PCAN_USB_ERROR_BUS_HEAVY 0x08 #define PCAN_USB_ERROR_BUS_OFF 0x10 #define PCAN_USB_ERROR_RXQEMPTY 0x20 #define PCAN_USB_ERROR_QOVR 0x40 #define PCAN_USB_ERROR_TXQFULL 0x80 #define PCAN_USB_ERROR_BUS (PCAN_USB_ERROR_BUS_LIGHT | \ PCAN_USB_ERROR_BUS_HEAVY | \ PCAN_USB_ERROR_BUS_OFF) /* SJA1000 modes */ #define SJA1000_MODE_NORMAL 0x00 #define SJA1000_MODE_INIT 0x01 /* * tick duration = 42.666 us => * (tick_number * 44739243) >> 20 ~ (tick_number * 42666) / 1000 * accuracy = 10^-7 */ #define PCAN_USB_TS_DIV_SHIFTER 20 #define PCAN_USB_TS_US_PER_TICK 44739243 /* PCAN-USB messages record types */ #define PCAN_USB_REC_ERROR 1 #define PCAN_USB_REC_ANALOG 2 #define PCAN_USB_REC_BUSLOAD 3 #define PCAN_USB_REC_TS 4 #define PCAN_USB_REC_BUSEVT 5 /* CAN bus events notifications selection mask */ #define PCAN_USB_ERR_RXERR 0x02 /* ask for rxerr counter */ #define PCAN_USB_ERR_TXERR 0x04 /* ask for txerr counter */ /* This mask generates an usb packet each time the state of the bus changes. * In other words, its interest is to know which side among rx and tx is * responsible of the change of the bus state. */ #define PCAN_USB_BERR_MASK (PCAN_USB_ERR_RXERR | PCAN_USB_ERR_TXERR) /* identify bus event packets with rx/tx error counters */ #define PCAN_USB_ERR_CNT_DEC 0x00 /* counters are decreasing */ #define PCAN_USB_ERR_CNT_INC 0x80 /* counters are increasing */ /* private to PCAN-USB adapter */ struct pcan_usb { struct peak_usb_device dev; struct peak_time_ref time_ref; struct timer_list restart_timer; struct can_berr_counter bec; }; /* incoming message context for decoding */ struct pcan_usb_msg_context { u16 ts16; u8 prev_ts8; u8 *ptr; u8 *end; u8 rec_cnt; u8 rec_idx; u8 rec_ts_idx; struct net_device *netdev; struct pcan_usb *pdev; }; /* * send a command */ static int pcan_usb_send_cmd(struct peak_usb_device *dev, u8 f, u8 n, u8 *p) { int err; int actual_length; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; dev->cmd_buf[PCAN_USB_CMD_FUNC] = f; dev->cmd_buf[PCAN_USB_CMD_NUM] = n; if (p) memcpy(dev->cmd_buf + PCAN_USB_CMD_ARGS, p, PCAN_USB_CMD_ARGS_LEN); err = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USB_EP_CMDOUT), dev->cmd_buf, PCAN_USB_CMD_LEN, &actual_length, PCAN_USB_COMMAND_TIMEOUT); if (err) netdev_err(dev->netdev, "sending cmd f=0x%x n=0x%x failure: %d\n", f, n, err); return err; } /* * send a command then wait for its response */ static int pcan_usb_wait_rsp(struct peak_usb_device *dev, u8 f, u8 n, u8 *p) { int err; int actual_length; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; /* first, send command */ err = pcan_usb_send_cmd(dev, f, n, NULL); if (err) return err; err = usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, PCAN_USB_EP_CMDIN), dev->cmd_buf, PCAN_USB_CMD_LEN, &actual_length, PCAN_USB_COMMAND_TIMEOUT); if (err) netdev_err(dev->netdev, "waiting rsp f=0x%x n=0x%x failure: %d\n", f, n, err); else if (p) memcpy(p, dev->cmd_buf + PCAN_USB_CMD_ARGS, PCAN_USB_CMD_ARGS_LEN); return err; } static int pcan_usb_set_sja1000(struct peak_usb_device *dev, u8 mode) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [1] = mode, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_REGISTER, PCAN_USB_SET, args); } static int pcan_usb_set_bus(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_SET_BUS, PCAN_USB_BUS_XCVER, args); } static int pcan_usb_set_silent(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_SET_BUS, PCAN_USB_BUS_SILENT_MODE, args); } /* send the cmd to be notified from bus errors */ static int pcan_usb_set_err_frame(struct peak_usb_device *dev, u8 err_mask) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = err_mask, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_ERR_FR, PCAN_USB_SET, args); } static int pcan_usb_set_ext_vcc(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_EXT_VCC, PCAN_USB_SET, args); } static int pcan_usb_set_led(struct peak_usb_device *dev, u8 onoff) { u8 args[PCAN_USB_CMD_ARGS_LEN] = { [0] = !!onoff, }; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_LED, PCAN_USB_SET, args); } /* * set bittiming value to can */ static int pcan_usb_set_bittiming(struct peak_usb_device *dev, struct can_bittiming *bt) { u8 args[PCAN_USB_CMD_ARGS_LEN]; u8 btr0, btr1; btr0 = ((bt->brp - 1) & 0x3f) | (((bt->sjw - 1) & 0x3) << 6); btr1 = ((bt->prop_seg + bt->phase_seg1 - 1) & 0xf) | (((bt->phase_seg2 - 1) & 0x7) << 4); if (dev->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) btr1 |= 0x80; netdev_info(dev->netdev, "setting BTR0=0x%02x BTR1=0x%02x\n", btr0, btr1); args[0] = btr1; args[1] = btr0; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_BITRATE, PCAN_USB_SET, args); } /* * init/reset can */ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) { int err; err = pcan_usb_set_bus(dev, onoff); if (err) return err; if (!onoff) { err = pcan_usb_set_sja1000(dev, SJA1000_MODE_INIT); } else { /* the PCAN-USB needs time to init */ set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); } return err; } /* * handle end of waiting for the device to reset */ static void pcan_usb_restart(struct timer_list *t) { struct pcan_usb *pdev = timer_container_of(pdev, t, restart_timer); struct peak_usb_device *dev = &pdev->dev; /* notify candev and netdev */ peak_usb_restart_complete(dev); } /* * handle the submission of the restart urb */ static void pcan_usb_restart_pending(struct urb *urb) { struct pcan_usb *pdev = urb->context; /* the PCAN-USB needs time to restart */ mod_timer(&pdev->restart_timer, jiffies + msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); /* can delete usb resources */ peak_usb_async_complete(urb); } /* * handle asynchronous restart */ static int pcan_usb_restart_async(struct peak_usb_device *dev, struct urb *urb, u8 *buf) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); if (timer_pending(&pdev->restart_timer)) return -EBUSY; /* set bus on */ buf[PCAN_USB_CMD_FUNC] = 3; buf[PCAN_USB_CMD_NUM] = 2; buf[PCAN_USB_CMD_ARGS] = 1; usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USB_EP_CMDOUT), buf, PCAN_USB_CMD_LEN, pcan_usb_restart_pending, pdev); return usb_submit_urb(urb, GFP_ATOMIC); } /* * read serial number from device */ static int pcan_usb_get_serial(struct peak_usb_device *dev, u32 *serial_number) { u8 args[PCAN_USB_CMD_ARGS_LEN]; int err; err = pcan_usb_wait_rsp(dev, PCAN_USB_CMD_SN, PCAN_USB_GET, args); if (err) return err; *serial_number = le32_to_cpup((__le32 *)args); return 0; } /* * read can channel id from device */ static int pcan_usb_get_can_channel_id(struct peak_usb_device *dev, u32 *can_ch_id) { u8 args[PCAN_USB_CMD_ARGS_LEN]; int err; err = pcan_usb_wait_rsp(dev, PCAN_USB_CMD_DEVID, PCAN_USB_GET, args); if (err) netdev_err(dev->netdev, "getting can channel id failure: %d\n", err); else *can_ch_id = args[0]; return err; } /* set a new CAN channel id in the flash memory of the device */ static int pcan_usb_set_can_channel_id(struct peak_usb_device *dev, u32 can_ch_id) { u8 args[PCAN_USB_CMD_ARGS_LEN]; /* this kind of device supports 8-bit values only */ if (can_ch_id > U8_MAX) return -EINVAL; /* during the flash process the device disconnects during ~1.25 s.: * prohibit access when interface is UP */ if (dev->netdev->flags & IFF_UP) return -EBUSY; args[0] = can_ch_id; return pcan_usb_send_cmd(dev, PCAN_USB_CMD_DEVID, PCAN_USB_SET, args); } /* * update current time ref with received timestamp */ static int pcan_usb_update_ts(struct pcan_usb_msg_context *mc) { if ((mc->ptr + 2) > mc->end) return -EINVAL; mc->ts16 = get_unaligned_le16(mc->ptr); if (mc->rec_idx > 0) peak_usb_update_ts_now(&mc->pdev->time_ref, mc->ts16); else peak_usb_set_ts_now(&mc->pdev->time_ref, mc->ts16); return 0; } /* * decode received timestamp */ static int pcan_usb_decode_ts(struct pcan_usb_msg_context *mc, u8 first_packet) { /* only 1st packet supplies a word timestamp */ if (first_packet) { if ((mc->ptr + 2) > mc->end) return -EINVAL; mc->ts16 = get_unaligned_le16(mc->ptr); mc->prev_ts8 = mc->ts16 & 0x00ff; mc->ptr += 2; } else { u8 ts8; if ((mc->ptr + 1) > mc->end) return -EINVAL; ts8 = *mc->ptr++; if (ts8 < mc->prev_ts8) mc->ts16 += 0x100; mc->ts16 &= 0xff00; mc->ts16 |= ts8; mc->prev_ts8 = ts8; } return 0; } static int pcan_usb_decode_error(struct pcan_usb_msg_context *mc, u8 n, u8 status_len) { struct sk_buff *skb; struct can_frame *cf; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; /* ignore this error until 1st ts received */ if (n == PCAN_USB_ERROR_QOVR) if (!mc->pdev->time_ref.tick_count) return 0; /* allocate an skb to store the error frame */ skb = alloc_can_err_skb(mc->netdev, &cf); if (n & PCAN_USB_ERROR_RXQOVR) { /* data overrun interrupt */ netdev_dbg(mc->netdev, "data overrun interrupt\n"); mc->netdev->stats.rx_over_errors++; mc->netdev->stats.rx_errors++; if (cf) { cf->can_id |= CAN_ERR_CRTL; cf->data[1] |= CAN_ERR_CRTL_RX_OVERFLOW; } } if (n & PCAN_USB_ERROR_TXQFULL) netdev_dbg(mc->netdev, "device Tx queue full)\n"); if (n & PCAN_USB_ERROR_BUS_OFF) { new_state = CAN_STATE_BUS_OFF; } else if (n & PCAN_USB_ERROR_BUS_HEAVY) { new_state = ((mc->pdev->bec.txerr >= 128) || (mc->pdev->bec.rxerr >= 128)) ? CAN_STATE_ERROR_PASSIVE : CAN_STATE_ERROR_WARNING; } else { new_state = CAN_STATE_ERROR_ACTIVE; } /* handle change of state */ if (new_state != mc->pdev->dev.can.state) { enum can_state tx_state = (mc->pdev->bec.txerr >= mc->pdev->bec.rxerr) ? new_state : 0; enum can_state rx_state = (mc->pdev->bec.txerr <= mc->pdev->bec.rxerr) ? new_state : 0; can_change_state(mc->netdev, cf, tx_state, rx_state); if (new_state == CAN_STATE_BUS_OFF) { can_bus_off(mc->netdev); } else if (cf && (cf->can_id & CAN_ERR_CRTL)) { /* Supply TX/RX error counters in case of * controller error. */ cf->can_id = CAN_ERR_CNT; cf->data[6] = mc->pdev->bec.txerr; cf->data[7] = mc->pdev->bec.rxerr; } } if (!skb) return -ENOMEM; if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { struct skb_shared_hwtstamps *hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&mc->pdev->time_ref, mc->ts16, &hwts->hwtstamp); } netif_rx(skb); return 0; } /* decode bus event usb packet: first byte contains rxerr while 2nd one contains * txerr. */ static int pcan_usb_handle_bus_evt(struct pcan_usb_msg_context *mc, u8 ir) { struct pcan_usb *pdev = mc->pdev; /* according to the content of the packet */ switch (ir) { case PCAN_USB_ERR_CNT_DEC: case PCAN_USB_ERR_CNT_INC: /* save rx/tx error counters from in the device context */ pdev->bec.rxerr = mc->ptr[1]; pdev->bec.txerr = mc->ptr[2]; break; default: /* reserved */ break; } return 0; } /* * decode non-data usb message */ static int pcan_usb_decode_status(struct pcan_usb_msg_context *mc, u8 status_len) { u8 rec_len = status_len & PCAN_USB_STATUSLEN_DLC; u8 f, n; int err; /* check whether function and number can be read */ if ((mc->ptr + 2) > mc->end) return -EINVAL; f = mc->ptr[PCAN_USB_CMD_FUNC]; n = mc->ptr[PCAN_USB_CMD_NUM]; mc->ptr += PCAN_USB_CMD_ARGS; if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { int err = pcan_usb_decode_ts(mc, !mc->rec_ts_idx); if (err) return err; /* Next packet in the buffer will have a timestamp on a single * byte */ mc->rec_ts_idx++; } switch (f) { case PCAN_USB_REC_ERROR: err = pcan_usb_decode_error(mc, n, status_len); if (err) return err; break; case PCAN_USB_REC_ANALOG: /* analog values (ignored) */ rec_len = 2; break; case PCAN_USB_REC_BUSLOAD: /* bus load (ignored) */ rec_len = 1; break; case PCAN_USB_REC_TS: /* only timestamp */ if (pcan_usb_update_ts(mc)) return -EINVAL; break; case PCAN_USB_REC_BUSEVT: /* bus event notifications (get rxerr/txerr) */ err = pcan_usb_handle_bus_evt(mc, n); if (err) return err; break; default: netdev_err(mc->netdev, "unexpected function %u\n", f); break; } if ((mc->ptr + rec_len) > mc->end) return -EINVAL; mc->ptr += rec_len; return 0; } /* * decode data usb message */ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) { u8 rec_len = status_len & PCAN_USB_STATUSLEN_DLC; struct sk_buff *skb; struct can_frame *cf; struct skb_shared_hwtstamps *hwts; u32 can_id_flags; skb = alloc_can_skb(mc->netdev, &cf); if (!skb) return -ENOMEM; if (status_len & PCAN_USB_STATUSLEN_EXT_ID) { if ((mc->ptr + 4) > mc->end) goto decode_failed; can_id_flags = get_unaligned_le32(mc->ptr); cf->can_id = can_id_flags >> 3 | CAN_EFF_FLAG; mc->ptr += 4; } else { if ((mc->ptr + 2) > mc->end) goto decode_failed; can_id_flags = get_unaligned_le16(mc->ptr); cf->can_id = can_id_flags >> 5; mc->ptr += 2; } can_frame_set_cc_len(cf, rec_len, mc->pdev->dev.can.ctrlmode); /* Only first packet timestamp is a word */ if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) goto decode_failed; /* Next packet in the buffer will have a timestamp on a single byte */ mc->rec_ts_idx++; /* read data */ memset(cf->data, 0x0, sizeof(cf->data)); if (status_len & PCAN_USB_STATUSLEN_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { if ((mc->ptr + rec_len) > mc->end) goto decode_failed; memcpy(cf->data, mc->ptr, cf->len); mc->ptr += rec_len; /* Ignore next byte (client private id) if SRR bit is set */ if (can_id_flags & PCAN_USB_TX_SRR) mc->ptr++; /* update statistics */ mc->netdev->stats.rx_bytes += cf->len; } mc->netdev->stats.rx_packets++; /* convert timestamp into kernel time */ hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&mc->pdev->time_ref, mc->ts16, &hwts->hwtstamp); /* push the skb */ netif_rx(skb); return 0; decode_failed: dev_kfree_skb(skb); return -EINVAL; } /* * process incoming message */ static int pcan_usb_decode_msg(struct peak_usb_device *dev, u8 *ibuf, u32 lbuf) { struct pcan_usb_msg_context mc = { .rec_cnt = ibuf[1], .ptr = ibuf + PCAN_USB_MSG_HEADER_LEN, .end = ibuf + lbuf, .netdev = dev->netdev, .pdev = container_of(dev, struct pcan_usb, dev), }; int err; for (err = 0; mc.rec_idx < mc.rec_cnt && !err; mc.rec_idx++) { u8 sl = *mc.ptr++; /* handle status and error frames here */ if (sl & PCAN_USB_STATUSLEN_INTERNAL) { err = pcan_usb_decode_status(&mc, sl); /* handle normal can frames here */ } else { err = pcan_usb_decode_data(&mc, sl); } } return err; } /* * process any incoming buffer */ static int pcan_usb_decode_buf(struct peak_usb_device *dev, struct urb *urb) { int err = 0; if (urb->actual_length > PCAN_USB_MSG_HEADER_LEN) { err = pcan_usb_decode_msg(dev, urb->transfer_buffer, urb->actual_length); } else if (urb->actual_length > 0) { netdev_err(dev->netdev, "usb message length error (%u)\n", urb->actual_length); err = -EINVAL; } return err; } /* * process outgoing packet */ static int pcan_usb_encode_msg(struct peak_usb_device *dev, struct sk_buff *skb, u8 *obuf, size_t *size) { struct net_device *netdev = dev->netdev; struct net_device_stats *stats = &netdev->stats; struct can_frame *cf = (struct can_frame *)skb->data; u32 can_id_flags = cf->can_id & CAN_ERR_MASK; u8 *pc; obuf[0] = PCAN_USB_MSG_TX_CAN; obuf[1] = 1; /* only one CAN frame is stored in the packet */ pc = obuf + PCAN_USB_MSG_HEADER_LEN; /* status/len byte */ *pc = can_get_cc_dlc(cf, dev->can.ctrlmode); if (cf->can_id & CAN_RTR_FLAG) *pc |= PCAN_USB_STATUSLEN_RTR; /* can id */ if (cf->can_id & CAN_EFF_FLAG) { *pc |= PCAN_USB_STATUSLEN_EXT_ID; pc++; can_id_flags <<= 3; if (dev->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) can_id_flags |= PCAN_USB_TX_SRR; if (dev->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) can_id_flags |= PCAN_USB_TX_AT; put_unaligned_le32(can_id_flags, pc); pc += 4; } else { pc++; can_id_flags <<= 5; if (dev->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) can_id_flags |= PCAN_USB_TX_SRR; if (dev->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) can_id_flags |= PCAN_USB_TX_AT; put_unaligned_le16(can_id_flags, pc); pc += 2; } /* can data */ if (!(cf->can_id & CAN_RTR_FLAG)) { memcpy(pc, cf->data, cf->len); pc += cf->len; } /* SRR bit needs a writer id (useless here) */ if (can_id_flags & PCAN_USB_TX_SRR) *pc++ = 0x80; obuf[(*size)-1] = (u8)(stats->tx_packets & 0xff); return 0; } /* socket callback used to copy berr counters values received through USB */ static int pcan_usb_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct peak_usb_device *dev = netdev_priv(netdev); struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); *bec = pdev->bec; /* must return 0 */ return 0; } /* * start interface */ static int pcan_usb_start(struct peak_usb_device *dev) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); int err; /* number of bits used in timestamps read from adapter struct */ peak_usb_init_time_ref(&pdev->time_ref, &pcan_usb); pdev->bec.rxerr = 0; pdev->bec.txerr = 0; /* always ask the device for BERR reporting, to be able to switch from * WARNING to PASSIVE state */ err = pcan_usb_set_err_frame(dev, PCAN_USB_BERR_MASK); if (err) netdev_warn(dev->netdev, "Asking for BERR reporting error %u\n", err); /* if revision greater than 3, can put silent mode on/off */ if (dev->device_rev > 3) { err = pcan_usb_set_silent(dev, dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY); if (err) return err; } return pcan_usb_set_ext_vcc(dev, 0); } static int pcan_usb_init(struct peak_usb_device *dev) { struct pcan_usb *pdev = container_of(dev, struct pcan_usb, dev); u32 serial_number; int err; /* initialize a timer needed to wait for hardware restart */ timer_setup(&pdev->restart_timer, pcan_usb_restart, 0); /* * explicit use of dev_xxx() instead of netdev_xxx() here: * information displayed are related to the device itself, not * to the canx netdevice. */ err = pcan_usb_get_serial(dev, &serial_number); if (err) { dev_err(dev->netdev->dev.parent, "unable to read %s serial number (err %d)\n", pcan_usb.name, err); return err; } dev_info(dev->netdev->dev.parent, "PEAK-System %s adapter hwrev %u serial %08X (%u channel)\n", pcan_usb.name, dev->device_rev, serial_number, pcan_usb.ctrl_count); /* Since rev 4.1, PCAN-USB is able to make single-shot as well as * looped back frames. */ if (dev->device_rev >= 41) { struct can_priv *priv = netdev_priv(dev->netdev); priv->ctrlmode_supported |= CAN_CTRLMODE_ONE_SHOT | CAN_CTRLMODE_LOOPBACK; } else { dev_info(dev->netdev->dev.parent, "Firmware update available. Please contact support.peak@hms-networks.com\n"); } return 0; } /* * probe function for new PCAN-USB usb interface */ static int pcan_usb_probe(struct usb_interface *intf) { struct usb_host_interface *if_desc; int i; if_desc = intf->altsetting; /* check interface endpoint addresses */ for (i = 0; i < if_desc->desc.bNumEndpoints; i++) { struct usb_endpoint_descriptor *ep = &if_desc->endpoint[i].desc; switch (ep->bEndpointAddress) { case PCAN_USB_EP_CMDOUT: case PCAN_USB_EP_CMDIN: case PCAN_USB_EP_MSGOUT: case PCAN_USB_EP_MSGIN: break; default: return -ENODEV; } } return 0; } static int pcan_usb_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { struct peak_usb_device *dev = netdev_priv(netdev); int err = 0; switch (state) { case ETHTOOL_ID_ACTIVE: /* call ON/OFF twice a second */ return 2; case ETHTOOL_ID_OFF: err = pcan_usb_set_led(dev, 0); break; case ETHTOOL_ID_ON: fallthrough; case ETHTOOL_ID_INACTIVE: /* restore LED default */ err = pcan_usb_set_led(dev, 1); break; default: break; } return err; } /* This device only handles 8-bit CAN channel id. */ static int pcan_usb_get_eeprom_len(struct net_device *netdev) { return sizeof(u8); } static const struct ethtool_ops pcan_usb_ethtool_ops = { .set_phys_id = pcan_usb_set_phys_id, .get_ts_info = pcan_get_ts_info, .get_eeprom_len = pcan_usb_get_eeprom_len, .get_eeprom = peak_usb_get_eeprom, .set_eeprom = peak_usb_set_eeprom, }; /* * describe the PCAN-USB adapter */ static const struct can_bittiming_const pcan_usb_const = { .name = "pcan_usb", .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 64, .brp_inc = 1, }; const struct peak_usb_adapter pcan_usb = { .name = "PCAN-USB", .device_id = PCAN_USB_PRODUCT_ID, .ctrl_count = 1, .ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_CC_LEN8_DLC, .clock = { .freq = PCAN_USB_CRYSTAL_HZ / 2, }, .bittiming_const = &pcan_usb_const, /* size of device private data */ .sizeof_dev_private = sizeof(struct pcan_usb), .ethtool_ops = &pcan_usb_ethtool_ops, /* timestamps usage */ .ts_used_bits = 16, .us_per_ts_scale = PCAN_USB_TS_US_PER_TICK, /* us=(ts*scale) */ .us_per_ts_shift = PCAN_USB_TS_DIV_SHIFTER, /* >> shift */ /* give here messages in/out endpoints */ .ep_msg_in = PCAN_USB_EP_MSGIN, .ep_msg_out = {PCAN_USB_EP_MSGOUT}, /* size of rx/tx usb buffers */ .rx_buffer_size = PCAN_USB_RX_BUFFER_SIZE, .tx_buffer_size = PCAN_USB_TX_BUFFER_SIZE, /* device callbacks */ .intf_probe = pcan_usb_probe, .dev_init = pcan_usb_init, .dev_set_bus = pcan_usb_write_mode, .dev_set_bittiming = pcan_usb_set_bittiming, .dev_get_can_channel_id = pcan_usb_get_can_channel_id, .dev_set_can_channel_id = pcan_usb_set_can_channel_id, .dev_decode_buf = pcan_usb_decode_buf, .dev_encode_msg = pcan_usb_encode_msg, .dev_start = pcan_usb_start, .dev_restart_async = pcan_usb_restart_async, .do_get_berr_counter = pcan_usb_get_berr_counter, };
10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-only /* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8 * * (C) 2002 by Harald Welte <laforge@netfilter.org> * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> * * See RFC2474 for a description of the DSCP field within the IP Header. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_DSCP.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); #define XT_DSCP_ECN_MASK 3u static unsigned int dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static unsigned int dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static int dscp_tg_check(const struct xt_tgchk_param *par) { const struct xt_DSCP_info *info = par->targinfo; if (info->dscp > XT_DSCP_MAX) return -EDOM; return 0; } static unsigned int tos_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; orig = ipv4_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static unsigned int tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "DSCP", .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, }; static int __init dscp_tg_init(void) { return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } static void __exit dscp_tg_exit(void) { xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } module_init(dscp_tg_init); module_exit(dscp_tg_exit);
7 11 2 10 14 13 14 12 12 12 12 8 1 7 7 2 2 1 1 1 2 1 1 2 2 2 2 2 1 1 2 6 15 15 14 9 14 15 12 6 6 3 3 5 5 12 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2008 Red Hat, Inc. All rights reserved. * Copyright 2008 Ian Kent <raven@themaw.net> */ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/compat.h> #include <linux/fdtable.h> #include <linux/magic.h> #include <linux/nospec.h> #include "autofs_i.h" /* * This module implements an interface for routing autofs ioctl control * commands via a miscellaneous device file. * * The alternate interface is needed because we need to be able open * an ioctl file descriptor on an autofs mount that may be covered by * another mount. This situation arises when starting automount(8) * or other user space daemon which uses direct mounts or offset * mounts (used for autofs lazy mount/umount of nested mount trees), * which have been left busy at service shutdown. */ typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); static int check_name(const char *name) { if (!strchr(name, '/')) return -EINVAL; return 0; } /* * Check a string doesn't overrun the chunk of * memory we copied from user land. */ static int invalid_str(char *str, size_t size) { if (memchr(str, 0, size)) return 0; return -EINVAL; } /* * Check that the user compiled against correct version of autofs * misc device code. * * As well as checking the version compatibility this always copies * the kernel interface version out. */ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); err = -EINVAL; } /* Fill in the kernel version. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return err; } /* * Copy parameter control struct, including a possible path allocated * at the end of the struct. */ static struct autofs_dev_ioctl * copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); if (!IS_ERR(res)) res->size = tmp.size; return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); } /* * Check sanity of parameter control fields and if a path is present * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); if (err) { pr_warn("invalid device control module version " "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > AUTOFS_DEV_IOCTL_SIZE) { err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } /* Setting the per-dentry expire timeout requires a trailing * path component, ie. no '/', so invert the logic of the * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); goto out; } } else { if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { err = -EINVAL; goto out; } } err = 0; out: return err; } /* Return autofs dev ioctl version */ static int autofs_dev_ioctl_version(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { /* This should have already been set. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return 0; } /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protover.version = sbi->version; return 0; } /* Return autofs module protocol sub version */ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protosubver.sub_version = sbi->sub_version; return 0; } /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(const struct path *path, void *data), void *data) { struct path path; int err; err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); *res = path; err = 0; break; } } if (!follow_up(&path)) break; } path_put(&path); return err; } static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { struct path path __free(path_put) = {}; int err; err = find_autofs_mount(name, &path, test_by_dev, &devid); if (err) return err; return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } /* Open a file descriptor on an autofs mount point */ static int autofs_dev_ioctl_openmount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { const char *path; dev_t devid; int err, fd; /* param->path has been checked in validate_dev_ioctl() */ if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; devid = new_decode_dev(param->openmount.devid); err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); if (unlikely(fd < 0)) { err = fd; goto out; } param->ioctlfd = fd; out: return err; } /* Close file descriptor allocated above (user can also use close(2)). */ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { return close_fd(param->ioctlfd); } /* * Send "ready" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_ready(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; return autofs_wait_release(sbi, token, 0); } /* * Send "fail" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_fail(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; int status; token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs_wait_release(sbi, token, status); } /* * Set the pipe fd for kernel communication to the daemon. * * Normally this is set at mount using an option but if we * are reconnecting to a busy mount then we need to use this * to tell the autofs mount about the new kernel pipe fd. In * order to protect mounts against incorrectly setting the * pipefd we also require that the autofs mount be catatonic. * * This also sets the process group id used to identify the * controlling process (eg. the owning automount(8) daemon). */ static int autofs_dev_ioctl_setpipefd(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { int pipefd; int err = 0; struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) { mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { struct file *pipe; new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; } if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; } swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } /* * Make the autofs mount point catatonic, no longer responsive to * mount requests. Also closes the kernel pipe file descriptor. */ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_catatonic_mode(sbi); return 0; } /* * Set the autofs mount expire timeout. * * There are two places an expire timeout can be set, in the autofs * super block info. (this is all that's needed for direct and offset * mounts because there's a distinct mount corresponding to each of * these) and per-dentry within within the dentry info. If a per-dentry * timeout is set it will override the expire timeout set in the parent * autofs super block info. * * If setting the autofs super block expire timeout the autofs_dev_ioctl * size field will be equal to the autofs_dev_ioctl structure size. If * setting the per-dentry expire timeout the mount point name is passed * in the autofs_dev_ioctl path field and the size field updated to * reflect this. * * Setting the autofs mount expire timeout sets the timeout in the super * block info. struct. Setting the per-dentry timeout does a little more. * If the timeout is equal to -1 the per-dentry timeout (and flag) is * cleared which reverts to using the super block timeout, otherwise if * timeout is 0 the timeout is set to this value and the flag is left * set which disables expiration for the mount point, lastly the flag * and the timeout are set enabling the dentry to use this timeout. */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { unsigned long timeout = param->timeout.timeout; /* If setting the expire timeout for an individual indirect * mount point dentry the mount trailing component path is * placed in param->path and param->size adjusted to account * for it otherwise param->size it is set to the structure * size. */ if (param->size == AUTOFS_DEV_IOCTL_SIZE) { param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; if (!autofs_type_indirect(sbi->type)) return -EINVAL; dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len), base); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); if (!ino) { dput(dentry); return -ENOENT; } if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) param->timeout.timeout = ino->exp_timeout / HZ; else param->timeout.timeout = sbi->exp_timeout / HZ; if (timeout == -1) { /* Revert to using the super block timeout */ ino->flags &= ~AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = 0; } else { /* Set the dentry expire flag and timeout. * * If timeout is 0 it will prevent the expire * of this particular automount. */ ino->flags |= AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = timeout * HZ; } /* An expire timeout greater than the superblock timeout * could be a problem at shutdown but the super block * timeout itself can change so all we can really do is * warn the user. */ if (ino->flags & AUTOFS_INF_EXPIRE_SET && ino->exp_timeout > sbi->exp_timeout) pr_warn("per-mount expire timeout is greater than " "the parent autofs mount timeout which could " "prevent shutdown\n"); dput(dentry); } return 0; } /* * Return the uid and gid of the last request for the mount * * When reconstructing an autofs mount tree with active mounts * we need to re-connect to mounts that may have used the original * process uid and gid (or string variations of them) for mount * lookups within the map entry. */ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct autofs_info *ino; struct path path; dev_t devid; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; param->requester.uid = param->requester.gid = -1; err = find_autofs_mount(param->path, &path, test_by_dev, &devid); if (err) goto out; ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); out: return err; } /* * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more that can be done. */ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct vfsmount *mnt; int how; how = param->expire.how; mnt = fp->f_path.mnt; return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) param->askumount.may_umount = 1; return 0; } /* * Check if the given path is a mountpoint. * * If we are supplied with the file descriptor of an autofs * mount we're looking for a specific mount. In this case * the path is considered a mountpoint if it is itself a * mountpoint or contains a mount, such as a multi-mount * without a root mount. In this case we return 1 if the * path is a mount point and the super magic of the covering * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we * lookup the path and check if it is the root of a mount. * If a type is given we are looking for a particular autofs * mount and if we don't find a match we return fail. If the * located path is the root of a mount we return 1 along with * the super magic of the mount or 0 otherwise. * * In both cases the device number (as returned by * new_encode_dev()) is also returned. */ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct path path; const char *name; unsigned int type; unsigned int devid, magic; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, &path); else err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; err = find_autofs_mount(name, &path, test_by_dev, &dev); if (err) goto out; devid = new_encode_dev(dev); err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; path_put(&path); out: return err; } /* * Our range of ioctl numbers isn't 0 based so we need to shift * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table * lookup. */ #define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, autofs_dev_ioctl_openmount, autofs_dev_ioctl_closemount, autofs_dev_ioctl_ready, autofs_dev_ioctl_fail, autofs_dev_ioctl_setpipefd, autofs_dev_ioctl_catatonic, autofs_dev_ioctl_timeout, autofs_dev_ioctl_requester, autofs_dev_ioctl_expire, autofs_dev_ioctl_askumount, autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); if (idx >= ARRAY_SIZE(_ioctls)) return NULL; idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); return _ioctls[idx]; } /* ioctl dispatcher */ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; struct autofs_sb_info *sbi; unsigned int cmd_first, cmd; ioctl_fn fn = NULL; int err = 0; cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && !capable(CAP_SYS_ADMIN)) return -EPERM; /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) return PTR_ERR(param); err = validate_dev_ioctl(command, param); if (err) goto out; fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); err = -ENOTTY; goto out; } fp = NULL; sbi = NULL; /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the * file during close to allow for immediate release, * and the same for retrieving ioctl version. */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { struct super_block *sb; fp = fget(param->ioctlfd); if (!fp) { if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) goto cont; err = -EBADF; goto out; } sb = file_inode(fp)->i_sb; if (sb->s_type != &autofs_fs_type) { err = -EINVAL; fput(fp); goto out; } sbi = autofs_sbi(sb); /* * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. */ if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); goto out; } } cont: err = fn(fp, sbi, param); if (fp) fput(fp); if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: free_dev_ioctl(param); return err; } static long autofs_dev_ioctl(struct file *file, unsigned int command, unsigned long u) { int err; err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } #ifdef CONFIG_COMPAT static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, unsigned long u) { return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL #endif static const struct file_operations _dev_ioctl_fops = { .unlocked_ioctl = autofs_dev_ioctl, .compat_ioctl = autofs_dev_ioctl_compat, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops, .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ int __init autofs_dev_ioctl_init(void) { int r; r = misc_register(&_autofs_dev_ioctl_misc); if (r) { pr_err("misc_register failed for control device\n"); return r; } return 0; } void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); }
2623 2640 2617 3 2625 2623 32 2641 2642 2616 3 2620 2613 32 2636 1780 1787 1791 1774 1757 133 254 255 252 250 28 255 255 255 207 163 162 163 163 207 47 255 253 253 2618 2620 2623 878 593 2030 250 250 1798 1794 5 842 2636 2620 2613 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation.