Total coverage: 286825 (18%)of 1618736
3445 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the Ethernet IEEE 802.3 interface. * * Version: @(#)if_ether.h 1.0.1a 02/08/94 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> */ #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H #include <linux/skbuff.h> #include <uapi/linux/if_ether.h> /* XX:XX:XX:XX:XX:XX */ #define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1) static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + eth_hdr() */ static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->data; } static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); } int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); #endif /* _LINUX_IF_ETHER_H */
4 4 4 1 1 1 2 1 2 1 2 1 1 11 5 6 4 3 8 3 1 10 1 1 1 1 2 1 1 2 1 3 1 3 2 1 3 2 1 1 1 54 54 50 2 2 4 2 19 1 2 1 2 4 4 2 4 1 3 8 8 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 // SPDX-License-Identifier: GPL-2.0-or-later /* * PTP 1588 clock support - character device implementation. * * Copyright (C) 2010 OMICRON electronics GmbH */ #include <linux/compat.h> #include <linux/module.h> #include <linux/posix-clock.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include <linux/debugfs.h> #include <linux/nospec.h> #include "ptp_private.h" static int ptp_disable_pinfunc(struct ptp_clock_info *ops, enum ptp_pin_function func, unsigned int chan) { struct ptp_clock_request rq; int err = 0; memset(&rq, 0, sizeof(rq)); switch (func) { case PTP_PF_NONE: break; case PTP_PF_EXTTS: rq.type = PTP_CLK_REQ_EXTTS; rq.extts.index = chan; err = ops->enable(ops, &rq, 0); break; case PTP_PF_PEROUT: rq.type = PTP_CLK_REQ_PEROUT; rq.perout.index = chan; err = ops->enable(ops, &rq, 0); break; case PTP_PF_PHYSYNC: break; default: return -EINVAL; } return err; } int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan) { struct ptp_clock_info *info = ptp->info; struct ptp_pin_desc *pin1 = NULL, *pin2 = &info->pin_config[pin]; unsigned int i; /* Check to see if any other pin previously had this function. */ for (i = 0; i < info->n_pins; i++) { if (info->pin_config[i].func == func && info->pin_config[i].chan == chan) { pin1 = &info->pin_config[i]; break; } } if (pin1 && i == pin) return 0; /* Check the desired function and channel. 
*/ switch (func) { case PTP_PF_NONE: break; case PTP_PF_EXTTS: if (chan >= info->n_ext_ts) return -EINVAL; break; case PTP_PF_PEROUT: if (chan >= info->n_per_out) return -EINVAL; break; case PTP_PF_PHYSYNC: if (chan != 0) return -EINVAL; break; default: return -EINVAL; } if (info->verify(info, pin, func, chan)) { pr_err("driver cannot use function %u and channel %u on pin %u\n", func, chan, pin); return -EOPNOTSUPP; } /* Disable whatever function was previously assigned. */ if (pin1) { ptp_disable_pinfunc(info, func, chan); pin1->func = PTP_PF_NONE; pin1->chan = 0; } ptp_disable_pinfunc(info, pin2->func, pin2->chan); pin2->func = func; pin2->chan = chan; return 0; } int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; char debugfsname[32]; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return -EINVAL; queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); if (!queue->mask) { kfree(queue); return -EINVAL; } bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); scoped_guard(spinlock_irq, &ptp->tsevqs_lock) list_add_tail(&queue->qlist, &ptp->tsevqs); pccontext->private_clkdata = queue; /* Debugfs contents */ sprintf(debugfsname, "0x%p", queue); queue->debugfs_instance = debugfs_create_dir(debugfsname, ptp->debugfs_root); queue->dfs_bitmap.array = (u32 *)queue->mask; queue->dfs_bitmap.n_elements = DIV_ROUND_UP(PTP_MAX_CHANNELS, BITS_PER_BYTE * sizeof(u32)); debugfs_create_u32_array("mask", 0444, queue->debugfs_instance, &queue->dfs_bitmap); return 0; } int ptp_release(struct posix_clock_context *pccontext) { struct timestamp_event_queue *queue = pccontext->private_clkdata; struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); debugfs_remove(queue->debugfs_instance); pccontext->private_clkdata = NULL; scoped_guard(spinlock_irq, &ptp->tsevqs_lock) list_del(&queue->qlist); 
bitmap_free(queue->mask); kfree(queue); return 0; } static long ptp_clock_getcaps(struct ptp_clock *ptp, void __user *arg) { struct ptp_clock_caps caps = { .max_adj = ptp->info->max_adj, .n_alarm = ptp->info->n_alarm, .n_ext_ts = ptp->info->n_ext_ts, .n_per_out = ptp->info->n_per_out, .pps = ptp->info->pps, .n_pins = ptp->info->n_pins, .cross_timestamping = ptp->info->getcrosststamp != NULL, .adjust_phase = ptp->info->adjphase != NULL && ptp->info->getmaxphase != NULL, }; if (caps.adjust_phase) caps.max_phase_adj = ptp->info->getmaxphase(ptp->info); return copy_to_user(arg, &caps, sizeof(caps)) ? -EFAULT : 0; } static long ptp_extts_request(struct ptp_clock *ptp, unsigned int cmd, void __user *arg) { struct ptp_clock_request req = { .type = PTP_CLK_REQ_EXTTS }; struct ptp_clock_info *ops = ptp->info; unsigned int supported_extts_flags; if (copy_from_user(&req.extts, arg, sizeof(req.extts))) return -EFAULT; if (cmd == PTP_EXTTS_REQUEST2) { /* Tell the drivers to check the flags carefully. */ req.extts.flags |= PTP_STRICT_FLAGS; /* Make sure no reserved bit is set. */ if ((req.extts.flags & ~PTP_EXTTS_VALID_FLAGS) || req.extts.rsv[0] || req.extts.rsv[1]) return -EINVAL; /* Ensure one of the rising/falling edge bits is set. */ if ((req.extts.flags & PTP_ENABLE_FEATURE) && (req.extts.flags & PTP_EXTTS_EDGES) == 0) return -EINVAL; } else { req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS; memset(req.extts.rsv, 0, sizeof(req.extts.rsv)); } if (req.extts.index >= ops->n_ext_ts) return -EINVAL; supported_extts_flags = ptp->info->supported_extts_flags; /* The PTP_ENABLE_FEATURE flag is always supported. */ supported_extts_flags |= PTP_ENABLE_FEATURE; /* If the driver does not support strictly checking flags, the * PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely hints * which are not enforced. 
*/ if (!(supported_extts_flags & PTP_STRICT_FLAGS)) supported_extts_flags |= PTP_EXTTS_EDGES; /* Reject unsupported flags */ if (req.extts.flags & ~supported_extts_flags) return -EOPNOTSUPP; scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &ptp->pincfg_mux) return ops->enable(ops, &req, req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0); } static long ptp_perout_request(struct ptp_clock *ptp, unsigned int cmd, void __user *arg) { struct ptp_clock_request req = { .type = PTP_CLK_REQ_PEROUT }; struct ptp_perout_request *perout = &req.perout; struct ptp_clock_info *ops = ptp->info; if (copy_from_user(perout, arg, sizeof(*perout))) return -EFAULT; if (cmd == PTP_PEROUT_REQUEST2) { if (perout->flags & ~PTP_PEROUT_VALID_FLAGS) return -EINVAL; /* * The "on" field has undefined meaning if * PTP_PEROUT_DUTY_CYCLE isn't set, we must still treat it * as reserved, which must be set to zero. */ if (!(perout->flags & PTP_PEROUT_DUTY_CYCLE) && !mem_is_zero(perout->rsv, sizeof(perout->rsv))) return -EINVAL; if (perout->flags & PTP_PEROUT_DUTY_CYCLE) { /* The duty cycle must be subunitary. */ if (perout->on.sec > perout->period.sec || (perout->on.sec == perout->period.sec && perout->on.nsec > perout->period.nsec)) return -ERANGE; } if (perout->flags & PTP_PEROUT_PHASE) { /* * The phase should be specified modulo the period, * therefore anything equal or larger than 1 period * is invalid. 
*/ if (perout->phase.sec > perout->period.sec || (perout->phase.sec == perout->period.sec && perout->phase.nsec >= perout->period.nsec)) return -ERANGE; } } else { perout->flags &= PTP_PEROUT_V1_VALID_FLAGS; memset(perout->rsv, 0, sizeof(perout->rsv)); } if (perout->index >= ops->n_per_out) return -EINVAL; if (perout->flags & ~ops->supported_perout_flags) return -EOPNOTSUPP; scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &ptp->pincfg_mux) return ops->enable(ops, &req, perout->period.sec || perout->period.nsec); } static long ptp_enable_pps(struct ptp_clock *ptp, bool enable) { struct ptp_clock_request req = { .type = PTP_CLK_REQ_PPS }; struct ptp_clock_info *ops = ptp->info; if (!capable(CAP_SYS_TIME)) return -EPERM; scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &ptp->pincfg_mux) return ops->enable(ops, &req, enable); } static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) { struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; struct timespec64 ts; int err; if (!ptp->info->getcrosststamp) return -EOPNOTSUPP; err = ptp->info->getcrosststamp(ptp->info, &xtstamp); if (err) return err; memset(&precise_offset, 0, sizeof(precise_offset)); ts = ktime_to_timespec64(xtstamp.device); precise_offset.device.sec = ts.tv_sec; precise_offset.device.nsec = ts.tv_nsec; ts = ktime_to_timespec64(xtstamp.sys_realtime); precise_offset.sys_realtime.sec = ts.tv_sec; precise_offset.sys_realtime.nsec = ts.tv_nsec; ts = ktime_to_timespec64(xtstamp.sys_monoraw); precise_offset.sys_monoraw.sec = ts.tv_sec; precise_offset.sys_monoraw.nsec = ts.tv_nsec; return copy_to_user(arg, &precise_offset, sizeof(precise_offset)) ? 
-EFAULT : 0; } static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) { struct ptp_sys_offset_extended *extoff __free(kfree) = NULL; struct ptp_system_timestamp sts; if (!ptp->info->gettimex64) return -EOPNOTSUPP; extoff = memdup_user(arg, sizeof(*extoff)); if (IS_ERR(extoff)) return PTR_ERR(extoff); if (extoff->n_samples > PTP_MAX_SAMPLES || extoff->rsv[0] || extoff->rsv[1]) return -EINVAL; switch (extoff->clockid) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_RAW: break; case CLOCK_AUX ... CLOCK_AUX_LAST: if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) break; fallthrough; default: return -EINVAL; } sts.clockid = extoff->clockid; for (unsigned int i = 0; i < extoff->n_samples; i++) { struct timespec64 ts; int err; err = ptp->info->gettimex64(ptp->info, &ts, &sts); if (err) return err; /* Filter out disabled or unavailable clocks */ if (sts.pre_ts.tv_sec < 0 || sts.post_ts.tv_sec < 0) return -EINVAL; extoff->ts[i][0].sec = sts.pre_ts.tv_sec; extoff->ts[i][0].nsec = sts.pre_ts.tv_nsec; extoff->ts[i][1].sec = ts.tv_sec; extoff->ts[i][1].nsec = ts.tv_nsec; extoff->ts[i][2].sec = sts.post_ts.tv_sec; extoff->ts[i][2].nsec = sts.post_ts.tv_nsec; } return copy_to_user(arg, extoff, sizeof(*extoff)) ? 
-EFAULT : 0; } static long ptp_sys_offset(struct ptp_clock *ptp, void __user *arg) { struct ptp_sys_offset *sysoff __free(kfree) = NULL; struct ptp_clock_time *pct; struct timespec64 ts; sysoff = memdup_user(arg, sizeof(*sysoff)); if (IS_ERR(sysoff)) return PTR_ERR(sysoff); if (sysoff->n_samples > PTP_MAX_SAMPLES) return -EINVAL; pct = &sysoff->ts[0]; for (unsigned int i = 0; i < sysoff->n_samples; i++) { struct ptp_clock_info *ops = ptp->info; int err; ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; pct++; if (ops->gettimex64) err = ops->gettimex64(ops, &ts, NULL); else err = ops->gettime64(ops, &ts); if (err) return err; pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; pct++; } ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; return copy_to_user(arg, sysoff, sizeof(*sysoff)) ? -EFAULT : 0; } static long ptp_pin_getfunc(struct ptp_clock *ptp, unsigned int cmd, void __user *arg) { struct ptp_clock_info *ops = ptp->info; struct ptp_pin_desc pd; if (copy_from_user(&pd, arg, sizeof(pd))) return -EFAULT; if (cmd == PTP_PIN_GETFUNC2 && !mem_is_zero(pd.rsv, sizeof(pd.rsv))) return -EINVAL; if (pd.index >= ops->n_pins) return -EINVAL; scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &ptp->pincfg_mux) pd = ops->pin_config[array_index_nospec(pd.index, ops->n_pins)]; return copy_to_user(arg, &pd, sizeof(pd)) ? 
-EFAULT : 0; } static long ptp_pin_setfunc(struct ptp_clock *ptp, unsigned int cmd, void __user *arg) { struct ptp_clock_info *ops = ptp->info; struct ptp_pin_desc pd; unsigned int pin_index; if (copy_from_user(&pd, arg, sizeof(pd))) return -EFAULT; if (cmd == PTP_PIN_SETFUNC2 && !mem_is_zero(pd.rsv, sizeof(pd.rsv))) return -EINVAL; if (pd.index >= ops->n_pins) return -EINVAL; pin_index = array_index_nospec(pd.index, ops->n_pins); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &ptp->pincfg_mux) return ptp_set_pinfunc(ptp, pin_index, pd.func, pd.chan); } static long ptp_mask_clear_all(struct timestamp_event_queue *tsevq) { bitmap_clear(tsevq->mask, 0, PTP_MAX_CHANNELS); return 0; } static long ptp_mask_en_single(struct timestamp_event_queue *tsevq, void __user *arg) { unsigned int channel; if (copy_from_user(&channel, arg, sizeof(channel))) return -EFAULT; if (channel >= PTP_MAX_CHANNELS) return -EFAULT; set_bit(channel, tsevq->mask); return 0; } long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, unsigned long arg) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); void __user *argptr; if (in_compat_syscall() && cmd != PTP_ENABLE_PPS && cmd != PTP_ENABLE_PPS2) arg = (unsigned long)compat_ptr(arg); argptr = (void __force __user *)arg; switch (cmd) { case PTP_CLOCK_GETCAPS: case PTP_CLOCK_GETCAPS2: return ptp_clock_getcaps(ptp, argptr); case PTP_EXTTS_REQUEST: case PTP_EXTTS_REQUEST2: if ((pccontext->fp->f_mode & FMODE_WRITE) == 0) return -EACCES; return ptp_extts_request(ptp, cmd, argptr); case PTP_PEROUT_REQUEST: case PTP_PEROUT_REQUEST2: if ((pccontext->fp->f_mode & FMODE_WRITE) == 0) return -EACCES; return ptp_perout_request(ptp, cmd, argptr); case PTP_ENABLE_PPS: case PTP_ENABLE_PPS2: if ((pccontext->fp->f_mode & FMODE_WRITE) == 0) return -EACCES; return ptp_enable_pps(ptp, !!arg); case PTP_SYS_OFFSET_PRECISE: case PTP_SYS_OFFSET_PRECISE2: return ptp_sys_offset_precise(ptp, argptr); case 
PTP_SYS_OFFSET_EXTENDED: case PTP_SYS_OFFSET_EXTENDED2: return ptp_sys_offset_extended(ptp, argptr); case PTP_SYS_OFFSET: case PTP_SYS_OFFSET2: return ptp_sys_offset(ptp, argptr); case PTP_PIN_GETFUNC: case PTP_PIN_GETFUNC2: return ptp_pin_getfunc(ptp, cmd, argptr); case PTP_PIN_SETFUNC: case PTP_PIN_SETFUNC2: if ((pccontext->fp->f_mode & FMODE_WRITE) == 0) return -EACCES; return ptp_pin_setfunc(ptp, cmd, argptr); case PTP_MASK_CLEAR_ALL: return ptp_mask_clear_all(pccontext->private_clkdata); case PTP_MASK_EN_SINGLE: return ptp_mask_en_single(pccontext->private_clkdata, argptr); default: return -ENOTTY; } } __poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp, poll_table *wait) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; queue = pccontext->private_clkdata; if (!queue) return EPOLLERR; poll_wait(fp, &ptp->tsev_wq, wait); return queue_cnt(queue) ? EPOLLIN : 0; } #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event)) ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, char __user *buf, size_t cnt) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; struct ptp_extts_event *event; ssize_t result; queue = pccontext->private_clkdata; if (!queue) return -EINVAL; if (cnt % sizeof(*event) != 0) return -EINVAL; if (cnt > EXTTS_BUFSIZE) cnt = EXTTS_BUFSIZE; if (wait_event_interruptible(ptp->tsev_wq, ptp->defunct || queue_cnt(queue))) return -ERESTARTSYS; if (ptp->defunct) return -ENODEV; event = kmalloc(EXTTS_BUFSIZE, GFP_KERNEL); if (!event) return -ENOMEM; scoped_guard(spinlock_irq, &queue->lock) { size_t qcnt = min((size_t)queue_cnt(queue), cnt / sizeof(*event)); for (size_t i = 0; i < qcnt; i++) { event[i] = queue->buf[queue->head]; /* Paired with READ_ONCE() in queue_cnt() */ WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } cnt = qcnt * sizeof(*event); } 
result = cnt; if (copy_to_user(buf, event, cnt)) result = -EFAULT; kfree(event); return result; }
804 800 799 809 2 2 55 199 40 200 392 1109 467 383 63 36 203 191 194 12 12 38 21 1 37 24 23 2 106 106 67 40 11 32 33 9 29 33 24 25 25 7 21 25 107 107 75 75 108 109 1 1 1 1 1 381 488 40 799 476 330 32 302 129 57 2 71 674 11 11 139 137 11 137 137 137 137 3 134 123 54 1019 632 1010 1001 123 902 161 1009 1012 10 996 4 7 633 453 3 1019 136 138 2 1146 1151 287 1146 1155 1155 1 1 1 1 116 99 929 1 41 1190 7 2 1187 4 763 670 109 402 124 1155 2 130 2 4 1286 1290 1296 4 47 124 122 2 72 1104 1147 1155 1125 1051 1110 382 720 27 27 742 1113 1279 1 10 23 23 12 11 9 1 1 9 123 92 553 427 381 525 359 314 293 394 23 124 124 5 108 20 907 557 788 5 404 26 25 5 47 47 47 403 384 403 89 19 20 399 360 108 375 7 7 7 353 339 48 4 98 25 25 92 85 102 102 102 101 102 100 464 351 123 26 98 1399 1380 22 1396 293 293 123 124 29 11 3 4 2 368 776 18 771 6 765 183 642 40 40 16 40 39 39 2 96 1 40 897 684 83 989 1058 11 1054 1064 1068 1052 578 485 1068 1054 484 575 1068 587 492 2 11 1068 725 162 293 151 151 435 438 489 488 25 24 19 19 9 1 8 2 4 2 4 2 5 6 5 5 6 3 6 6 6 6 5 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 
1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 
1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 
2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 
2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 
2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 
3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include 
<linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memfd.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"
#include "swap.h"

/*
 * State carried through one follow_page_mask() walk.
 *
 * @pgmap:     handed down to follow_page_pte() by the PMD-level walker.
 * @page_mask: reset to 0 at the start of follow_page_mask(); the huge PMD/PUD
 *             handlers set it to HPAGE_PMD_NR - 1 / HPAGE_PUD_NR - 1.
 */
struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

/*
 * Debug-only (CONFIG_DEBUG_VM) check over an array of pinned pages: warn once
 * if a pinned anonymous page is no longer marked PageAnonExclusive().
 *
 * NULL entries, zero pages and non-anonymous folios are skipped. Without
 * CONFIG_DEBUG_VM this returns immediately.
 */
static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio;

		if (!page)
			continue;

		folio = page_folio(page);

		if (is_zero_page(page) ||
		    !folio_test_anon(folio))
			continue;
		/* Small folios and hugetlb: only the head page carries the flag. */
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
					     !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
*/
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	/* A negative refcount means the count is already corrupted: refuse. */
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

/*
 * Drop @refs references that were taken on @folio with the given gup @flags.
 *
 * For FOLL_PIN this first undoes the pin accounting: the zero folio is left
 * untouched (nothing is dropped), NR_FOLL_PIN_RELEASED is bumped by @refs, and
 * either the folio's _pincount is decremented by @refs or, for folios without
 * a pincount, @refs is scaled by GUP_PIN_COUNTING_BIAS before the final
 * folio_put_refs().
 */
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_has_pincount(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	folio_put_refs(folio, refs);
}

/**
 * try_grab_folio() - add a folio's refcount by a flag-dependent amount
 * @folio:    pointer to folio to be grabbed
 * @refs:     the value to (effectively) add to the folio's refcount
 * @flags:    gup flags: these are the FOLL_* flag values
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time.
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM		The folio's refcount was not positive, so the folio
 *			could not be grabbed.
 *   -EREMOTEIO		The folio is PCI P2PDMA memory and FOLL_PCI_P2PDMA
 *			was not set.
 *
 * It is called when we have a stable reference for the folio, typically in
 * GUP slow path.
*/
int __must_check try_grab_folio(struct folio *folio, int refs,
				unsigned int flags)
{
	/* The caller must already hold a reference; anything else is a bug. */
	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	/* PCI P2PDMA pages are only handed out when explicitly requested. */
	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_add(folio, refs);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_folio(folio))
			return 0;

		/*
		 * Increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_has_pincount(folio)) {
			folio_ref_add(folio, refs);
			atomic_add(refs, &folio->_pincount);
		} else {
			folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * unpin_folio() - release a dma-pinned folio
 * @folio:         pointer to folio to be released
 *
 * Folios that were pinned via memfd_pin_folios() or other similar routines
 * must be released either using unpin_folio() or unpin_folios().
 */
void unpin_folio(struct folio *folio)
{
	gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(unpin_folio);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on.  Makes no change
 * if the folio is a zero_page.
*/ void folio_add_pin(struct folio *folio) { if (is_zero_folio(folio)) return; /* * Similar to try_grab_folio(): be sure to *also* increment the normal * page refcount field at least once, so that the page really is * pinned. */ if (folio_has_pincount(folio)) { WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); folio_ref_inc(folio); atomic_inc(&folio->_pincount); } else { WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS); folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } } static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; } static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; } /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously * listed as clean. In any case, releases all pages using unpin_user_page(), * possibly via unpin_user_pages(), for the non-dirty case. * * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. 
If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * folio_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** * unpin_user_page_range_dirty_lock() - release and optionally dirty * gup-pinned page range * * @page: the starting page of a range maybe marked dirty, and definitely released. * @npages: number of consecutive pages to release. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * * For the page ranges defined by [page .. 
page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * Don't perform any sanity checks because we might have raced with * fork() and some anonymous pages might now actually be shared -- * which is why we're unpinning after all. */ for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. 
*/ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { if (!pages[i]) { nr = 1; continue; } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages); /** * unpin_user_folio() - release pages of a folio * @folio: pointer to folio to be released * @npages: number of pages of same folio * * Release npages of the folio */ void unpin_user_folio(struct folio *folio, unsigned long npages) { gup_put_folio(folio, npages, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_folio); /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. * @nfolios: number of folios in the @folios array. * * For each folio in the @folios array, release the folio using gup_put_folio. * * Please see the unpin_folio() documentation for details. */ void unpin_folios(struct folio **folios, unsigned long nfolios) { unsigned long i = 0, j; /* * If this WARN_ON() fires, then the system *might* be leaking folios * (by leaving them pinned), but probably not. More likely, gup/pup * returned a hard -ERRNO error to the caller, who erroneously passed * it here. */ if (WARN_ON(IS_ERR_VALUE(nfolios))) return; while (i < nfolios) { for (j = i + 1; j < nfolios; j++) if (folios[i] != folios[j]) break; if (folios[i]) gup_put_folio(folios[i], j - i, FOLL_PIN); i = j; } } EXPORT_SYMBOL_GPL(unpin_folios); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. 
*/
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	/* Test first so that the common already-set case does not write. */
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU

#ifdef CONFIG_HAVE_GUP_FAST
/*
 * Fill @pages with the consecutive subpages that back [@addr, @end) inside a
 * mapping of size @sz whose first page is @page.
 *
 * The offset of @addr within @sz selects the first subpage; one entry is
 * stored per PAGE_SIZE step until @addr reaches @end.
 *
 * Return: the number of entries written to @pages.
 */
static int record_subpages(struct page *page, unsigned long sz,
			   unsigned long addr, unsigned long end,
			   struct page **pages)
{
	struct page *start_page;
	int nr;

	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
		pages[nr] = nth_page(start_page, nr);

	return nr;
}

/**
 * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
 *
 * One of FOLL_PIN or FOLL_GET must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on folios that have a pincount (see folio_has_pincount()):
 *    folio's refcount will be incremented by @refs, and its pincount will be
 *    incremented by @refs.
 *
 *    FOLL_PIN on folios without a pincount: folio's refcount will be
 *    incremented by @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 *
 * It uses add ref unless zero to elevate the folio refcount and must be called
 * in fast path only.
*/ static struct folio *try_grab_folio_fast(struct page *page, int refs, unsigned int flags) { struct folio *folio; /* Raise warn if it is not called in fast GUP */ VM_WARN_ON_ONCE(!irqs_disabled()); if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; if (flags & FOLL_GET) return try_get_folio(page, refs); /* FOLL_PIN is set */ /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL; /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow * path. */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { folio_put_refs(folio, refs); return NULL; } /* * When pinning a large folio, use an exact count to track it. * * However, be sure to *also* increment the normal folio * refcount field at least once, so that the folio really * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ if (folio_has_pincount(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); /* * Adjust the pincount before re-checking the PTE for changes. * This is essentially a smp_mb() and is paired with a memory * barrier in folio_try_share_anon_rmap_*(). */ smp_mb__after_atomic(); node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; } #endif /* CONFIG_HAVE_GUP_FAST */ /* Common code for can_follow_write_* */ static inline bool can_follow_write_common(struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... 
or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ return page && PageAnon(page) && PageAnonExclusive(page); } static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { if (!(flags & FOLL_DUMP)) return NULL; /* * When core dumping, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ if (is_vm_hugetlb_page(vma)) { struct hstate *h = hstate_vma(vma); if (!hugetlbfs_pagecache_present(h, vma, address)) return ERR_PTR(-EFAULT); } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) { return ERR_PTR(-EFAULT); } return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ static inline bool can_follow_write_pud(pud_t pud, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pud is writable, we can write to the page. 
*/ if (pud_write(pud)) return true; return can_follow_write_common(page, vma, flags); } static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { struct mm_struct *mm = vma->vm_mm; struct page *page; pud_t pud = *pudp; unsigned long pfn = pud_pfn(pud); int ret; assert_spin_locked(pud_lockptr(mm, pudp)); if (!pud_present(pud)) return NULL; if ((flags & FOLL_WRITE) && !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; page = pfn_to_page(pfn); if (!pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); else ctx->page_mask = HPAGE_PUD_NR - 1; return page; } /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pmd is writable, we can write to the page. */ if (pmd_write(pmd)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. 
*/ if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; return !userfaultfd_huge_pmd_wp(vma, pmd); } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; struct page *page; int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); page = pmd_page(pmdval); if ((flags & FOLL_WRITE) && !can_follow_write_pmd(pmdval, page, vma, flags)) return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval)) return ERR_PTR(-EFAULT); if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) return NULL; if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) return ERR_PTR(ret); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH)) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; ctx->page_mask = HPAGE_PMD_NR - 1; return page; } #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { return NULL; } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { return NULL; } #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); 
update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; } /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ static inline bool can_follow_write_pte(pte_t pte, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pte is writable, we can write to the page. */ if (pte_write(pte)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ if (pte_needs_soft_dirty_wp(vma, pte)) return false; return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte(). */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. 
*/ ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !folio_test_dirty(folio)) folio_mark_dirty(folio); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * folio_mark_accessed(). */ folio_mark_accessed(folio); } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags, address); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); ptl = pmd_lock(mm, pmd); pmdval = *pmd; if (unlikely(!pmd_present(pmdval))) { spin_unlock(ptl); return no_page_table(vma, flags, address); } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? 
ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_huge_pmd(vma, address, pmd, flags, ctx); spin_unlock(ptl); return page; } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, struct follow_page_context *ctx) { pud_t *pudp, pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); page = follow_huge_pud(vma, address, pudp, flags, ctx); spin_unlock(ptl); if (page) return page; return no_page_table(vma, flags, address); } if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); return follow_pmd_mask(vma, address, pudp, flags, ctx); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, struct follow_page_context *ctx) { p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); return follow_pud_mask(vma, address, p4dp, flags, ctx); } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. 
Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	vma_pgtable_walk_begin(vma);

	/* Default to a single page; the huge leaf handlers widen the mask. */
	ctx->page_mask = 0;
	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		page = no_page_table(vma, flags, address);
	else
		page = follow_p4d_mask(vma, address, pgd, flags, ctx);

	vma_pgtable_walk_end(vma);

	return page;
}

/*
 * Look up @address in the gate area by walking the page tables of @mm.
 *
 * On success *@vma is set to get_gate_vma(@mm) and, if @page is non-NULL,
 * *@page is set to the backing page with a reference taken according to
 * @gup_flags (see try_grab_folio()).
 *
 * Return: 0 on success, -EFAULT if the request is for write access, if any
 * level of the page table is missing, or if there is no usable page behind
 * the PTE; otherwise the error returned by try_grab_folio().
 */
static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return -EFAULT;
	entry = ptep_get(pte);
	if (pte_none(entry))
		goto unmap;
	*vma = get_gate_vma(mm);
	/* The caller only wanted the vma: nothing to grab. */
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, entry);
	if (!*page) {
		/* Only the zero page is acceptable here, and never for dumps. */
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
			goto unmap;
		*page = pte_page(entry);
	}
	ret = try_grab_folio(page_folio(*page), 1, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released.
If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* FOLL_NOFAULT: bail out before the fault handler is ever called. */
	if (flags & FOLL_NOFAULT)
		return -EFAULT;
	/* Translate the FOLL_* request flags into FAULT_FLAG_* equivalents. */
	if (flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released.  Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, flags);

		/* An error bit that maps to no errno is treated as a bug. */
		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		/* Report the lock as dropped unless this was a NOWAIT attempt. */
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * do not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * This results in both data being written to a folio without writenotify, and
 * the folio being dirtied unexpectedly (if the caller decides to do so).
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
					  unsigned long gup_flags)
{
	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the case we disallow.
	 */
	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
	    (FOLL_PIN | FOLL_LONGTERM))
		return true;

	/*
	 * If the VMA does not require dirty tracking then no problematic write
	 * can occur either.
	 */
	return !vma_needs_dirty_tracking(vma);
}

/*
 * Check whether @vma may be accessed as requested by @gup_flags.
 *
 * Returns 0 when the access is permitted, -EOPNOTSUPP for the unsupported
 * FOLL_LONGTERM-on-fsdax and FOLL_SPLIT_PMD-on-hugetlb combinations, and
 * -EFAULT for every other rejection.
 */
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);
	bool vma_anon = vma_is_anonymous(vma);

	/* VM_IO and VM_PFNMAP mappings are always rejected. */
	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	/* FOLL_ANON restricts the lookup to anonymous vmas. */
	if ((gup_flags & FOLL_ANON) && !vma_anon)
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!vma_anon &&
		    !writable_file_mapping_allowed(vma, gup_flags))
			return -EFAULT;

		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/*
 * This is "vma_lookup()", but with a warning if we would have
 * historically expanded the stack in the GUP code.
*/ static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL; #endif } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held. It may be released. See below. 
* * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may * be released. If this happens *@locked will be set to 0 on return. * * A caller using such a combination of @gup_flags must therefore hold the * mmap_lock for reading only, and recognize when it's been released. Otherwise, * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. 
*/
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr_remote(mm, start);

	/* A pages array must be supplied exactly when FOLL_GET or FOLL_PIN is set. */
	VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			(FOLL_PIN | FOLL_GET));

	do {
		struct page *page;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			/*
			 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
			 * lookups+error reporting differently.
			 */
			if (gup_flags & FOLL_MADV_POPULATE) {
				vma = vma_lookup(mm, start);
				if (!vma) {
					ret = -ENOMEM;
					goto out;
				}
				if (check_vma_flags(vma, gup_flags)) {
					ret = -EINVAL;
					goto out;
				}
				goto retry;
			}
			vma = gup_vma_lookup(mm, start);
			/* No vma, but the address may still be in the gate area. */
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &page : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, gup_flags, &ctx);
		/*
		 * No page, or -EMLINK: fault the page in.  The -EMLINK case is
		 * passed to faultin_page() as its "unshare" argument.
		 */
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, gup_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				/* Not reported as an error; stop with what we have. */
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
next_page:
		/*
		 * Number of pages handled in this step, derived from
		 * ctx.page_mask and capped to the pages still requested.
		 */
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;

		if (pages) {
			struct page *subpage;
			unsigned int j;

			/*
			 * This must be a large folio (and doesn't need to
			 * be the whole folio; it can be part of it), do
			 * the refcount work for all the subpages too.
			 *
			 * NOTE: here the page may not be the head page
			 * e.g. when start addr is not thp-size aligned.
			 * try_grab_folio() should have taken care of tail
			 * pages.
			 */
			if (page_increm > 1) {
				struct folio *folio = page_folio(page);

				/*
				 * Since we already hold refcount on the
				 * large folio, this should never fail.
				 */
				if (try_grab_folio(folio, page_increm - 1,
						   gup_flags)) {
					/*
					 * Release the 1st page ref if the
					 * folio is problematic, fail hard.
					 */
					gup_put_folio(folio, 1, gup_flags);
					ret = -EFAULT;
					goto out;
				}
			}

			/* Record each subpage and flush it for the caller. */
			for (j = 0; j < page_increm; j++) {
				subpage = nth_page(page, j);
				pages[i + j] = subpage;
				flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
				flush_dcache_page(subpage);
			}
		}

		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	/* Drop the dev_pagemap held in the walk context, if there is one. */
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	/* Progress takes precedence over the error code. */
	return i ? i : ret;
}

/*
 * Check whether a fault described by @fault_flags is allowed on @vma:
 * VM_WRITE is required for write faults and VM_READ otherwise, and the
 * architecture must also permit the access.
 */
static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
 *		does not allow retry. If NULL, the caller must guarantee
 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock. So it has not the
 * same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); /* * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is * specified, it'll also respond to generic signals. The caller of GUP * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. */ static bool gup_signal_pending(unsigned int flags) { if (fatal_signal_pending(current)) return true; if (!(flags & FOLL_INTERRUPTIBLE)) return false; return signal_pending(current); } /* * Locking: (*locked == 1) means that the mmap_lock has already been acquired by * the caller. This function may drop the mmap_lock. If it does so, then it will * set (*locked = 0). * * (*locked == 0) means that the caller expects this function to acquire and * drop the mmap_lock. Therefore, the value of *locked will still be zero when * the function returns, even though it may have changed temporarily during * function execution. 
*
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool must_unlock = false;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}
	else
		mmap_assert_locked(mm);

	if (flags & FOLL_PIN)
		mm_set_has_pinned_flag(&mm->flags);

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	for (;;) {
		/* Bulk attempt over everything that is still outstanding. */
		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
				       locked);
		if (!(flags & FOLL_UNLOCKABLE)) {
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			pages_done = ret;
			break;
		}

		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
		VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/* The lock was temporarily dropped, so we must unlock later */
		must_unlock = true;

retry:
		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * with both FAULT_FLAG_ALLOW_RETRY and
		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
		 * by fatal signals or even common signals, depending on
		 * the caller's request. So we need to check it before we
		 * start trying again otherwise it can loop forever.
		 */
		if (gup_signal_pending(flags)) {
			if (!pages_done)
				pages_done = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret) {
			if (!pages_done)
				pages_done = ret;
			break;
		}

		/* Retry exactly one page, this time with FOLL_TRIED set. */
		*locked = 1;
		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
				       pages, locked);
		if (!*locked) {
			/* Continue to retry until we succeeded */
			VM_WARN_ON_ONCE(ret != 0);
			goto retry;
		}
		if (ret != 1) {
			VM_WARN_ON_ONCE(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/* The single page succeeded; step past it and go back to bulk mode. */
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (must_unlock && *locked) {
		/*
		 * We either temporarily dropped the lock, or the caller
		 * requested that we both acquire and drop the lock. Either way,
		 * we must now unlock, and notify the caller of that state.
		 */
		mmap_read_unlock(mm);
		*locked = 0;
	}

	/*
	 * Failing to pin anything implies something has gone wrong (except when
	 * FOLL_NOWAIT is specified).
	 */
	if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
		return -EFAULT;

	return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
* * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma); VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; /* ... similarly, we've never faulted in PROT_NONE pages */ if (!vma_is_accessible(vma)) return -EFAULT; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. * * Otherwise, do a read fault, and use FOLL_FORCE in case it's not * readable (ie write-only or executable). */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; else gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } /* * faultin_page_range() - populate (prefault) page tables inside the * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. 
* * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the MM, or a negative error * code on error (see __get_user_pages()). Note that this function reports * errors related to VMAs, such as incompatible mappings, as expected by * MADV_POPULATE_(READ|WRITE). * * The range must be page-aligned. * * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked) { unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* * FOLL_TOUCH: Mark page accessed and thereby young; will also mark * the page dirty with FOLL_WRITE -- which doesn't make a * difference with !FOLL_FORCE, because the page is writable * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, gup_flags); lru_add_drain(); return ret; } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. 
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			/* First pass, or the lock was dropped: lock and look up afresh. */
			locked = 1;
			mmap_read_lock(mm);
			vma = find_vma_intersection(mm, nstart, end);
		} else if (nstart >= vma->vm_end)
			vma = find_vma_intersection(mm, vma->vm_end, end);

		if (!vma)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		/* Resume right after the pages that were actually processed. */
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		mmap_read_unlock(mm);
	return ret;	/* 0 or negative error code */
}
#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU variant of __get_user_pages_locked().
 *
 * Checks each address in turn against its vma and, when @pages is given,
 * stores virt_to_page() of the address and takes a reference on it.
 * Returns the number of pages processed, or -EFAULT if there were none.
 */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
		unsigned long nr_pages, struct page **pages,
		int *locked, unsigned int foll_flags)
{
	struct vm_area_struct *vma;
	bool must_unlock = false;
	vm_flags_t vm_flags;
	long i;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}

	/* calculate required read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			break;

		/* protect what we can, including chardevs */
		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			break;

		if (pages) {
			pages[i] = virt_to_page((void *)start);
			if (pages[i])
				get_page(pages[i]);
		}

		/* Advance to the start of the following page. */
		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	if (must_unlock && *locked) {
		mmap_read_unlock(mm);
		*locked = 0;
	}

	return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */

/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
	const unsigned long start = (unsigned long)uaddr;
	const unsigned long end = start + size;
	unsigned long cur;

	if (unlikely(size == 0))
		return 0;
	if (!user_write_access_begin(uaddr, size))
		return size;

	/* Stop once we overflow to 0. */
	/* Writes a zero byte at @start and at each later page boundary below @end. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		unsafe_put_user(0, (char __user *)cur, out);
out:
	user_write_access_end();
	/* cur - start is how far the loop got; the remainder was not faulted in. */
	if (size > cur - start)
		return size - (cur - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/**
 * fault_in_subpage_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Fault in a user address range for writing while checking for permissions at
 * sub-page granularity (e.g. arm64 MTE). This function should be used when
 * the caller cannot guarantee forward progress of a copy_to_user() loop.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
*/ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) { size_t faulted_in; /* * Attempt faulting in at page granularity first for page table * permission checking. The arch-specific probe_subpage_writeable() * functions may not check for this. */ faulted_in = size - fault_in_writeable(uaddr, size); if (faulted_in) faulted_in -= probe_subpage_writeable(uaddr, faulted_in); return size - faulted_in; } EXPORT_SYMBOL(fault_in_subpage_writeable); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { const unsigned long start = (unsigned long)uaddr; const unsigned long end = start + size; unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; mmap_read_lock(mm); /* Stop once we overflow to 0. */ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; mmap_read_unlock(mm); if (size > cur - start) return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). 
*/ size_t fault_in_readable(const char __user *uaddr, size_t size) { const unsigned long start = (unsigned long)uaddr; const unsigned long end = start + size; unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; /* Stop once we overflow to 0. */ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; if (size > cur - start) return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). * * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * An array of either pages or folios ("pofs"). Although it may seem tempting to * avoid this complication, by simply interpreting a list of folios as a list of * pages, that approach won't work in the longer term, because eventually the * layouts of struct page and struct folio will become completely different. * Furthermore, this pof approach avoids excessive page_folio() calls. 
*/ struct pages_or_folios { union { struct page **pages; struct folio **folios; void **entries; }; bool has_folios; long nr_entries; }; static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i) { if (pofs->has_folios) return pofs->folios[i]; return page_folio(pofs->pages[i]); } static void pofs_clear_entry(struct pages_or_folios *pofs, long i) { pofs->entries[i] = NULL; } static void pofs_unpin(struct pages_or_folios *pofs) { if (pofs->has_folios) unpin_folios(pofs->folios, pofs->nr_entries); else unpin_user_pages(pofs->pages, pofs->nr_entries); } static struct folio *pofs_next_folio(struct folio *folio, struct pages_or_folios *pofs, long *index_ptr) { long i = *index_ptr + 1; if (!pofs->has_folios && folio_test_large(folio)) { const unsigned long start_pfn = folio_pfn(folio); const unsigned long end_pfn = start_pfn + folio_nr_pages(folio); for (; i < pofs->nr_entries; i++) { unsigned long pfn = page_to_pfn(pofs->pages[i]); /* Is this page part of this folio? */ if (pfn < start_pfn || pfn >= end_pfn) break; } } if (unlikely(i == pofs->nr_entries)) return NULL; *index_ptr = i; return pofs_get_folio(pofs, i); } /* * Returns the number of collected folios. Return value is always >= 0. 
*/
static unsigned long collect_longterm_unpinnable_folios(
		struct list_head *movable_folio_list,
		struct pages_or_folios *pofs)
{
	unsigned long collected = 0;
	struct folio *folio;
	/*
	 * How far LRU draining has been escalated so far: 0 = not at all,
	 * 1 = lru_add_drain() done, 2 = lru_add_drain_all() done.
	 */
	int drained = 0;
	long i = 0;

	for (folio = pofs_get_folio(pofs, i); folio;
	     folio = pofs_next_folio(folio, pofs, &i)) {

		if (folio_is_longterm_pinnable(folio))
			continue;

		collected++;

		/* Device coherent folios are counted but not put on the list. */
		if (folio_is_device_coherent(folio))
			continue;

		if (folio_test_hugetlb(folio)) {
			folio_isolate_hugetlb(folio, movable_folio_list);
			continue;
		}

		/*
		 * Drain when the folio's reference count differs from the
		 * expected count plus one, trying the cheaper drain first.
		 * NOTE(review): the extra one presumably accounts for the
		 * caller's pin - confirm.
		 */
		if (drained == 0 && folio_may_be_lru_cached(folio) &&
				folio_ref_count(folio) !=
				folio_expected_ref_count(folio) + 1) {
			lru_add_drain();
			drained = 1;
		}
		if (drained == 1 && folio_may_be_lru_cached(folio) &&
				folio_ref_count(folio) !=
				folio_expected_ref_count(folio) + 1) {
			lru_add_drain_all();
			drained = 2;
		}

		if (!folio_isolate_lru(folio))
			continue;

		list_add_tail(&folio->lru, movable_folio_list);
		node_stat_mod_folio(folio,
				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
				    folio_nr_pages(folio));
	}

	return collected;
}

/*
 * Unpins all folios and migrates device coherent folios and movable_folio_list.
 * Returns -EAGAIN if all folios were successfully migrated or -errno for
 * failure (or partial success).
 */
static int
migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
				   struct pages_or_folios *pofs)
{
	int ret;
	unsigned long i;

	for (i = 0; i < pofs->nr_entries; i++) {
		struct folio *folio = pofs_get_folio(pofs, i);

		if (folio_is_device_coherent(folio)) {
			/*
			 * Migration will fail if the folio is pinned, so
			 * convert the pin on the source folio to a normal
			 * reference.
			 */
			pofs_clear_entry(pofs, i);
			folio_get(folio);
			gup_put_folio(folio, 1, FOLL_PIN);

			if (migrate_device_coherent_folio(folio)) {
				ret = -EBUSY;
				goto err;
			}

			continue;
		}

		/*
		 * We can't migrate folios with unexpected references, so drop
		 * the reference obtained by __get_user_pages_locked().
		 * Migrating folios have been added to movable_folio_list after
		 * calling folio_isolate_lru() which takes a reference so the
		 * folio won't be freed if it's migrating.
		 */
		unpin_folio(folio);
		pofs_clear_entry(pofs, i);
	}

	if (!list_empty(movable_folio_list)) {
		struct migration_target_control mtc = {
			.nid = NUMA_NO_NODE,
			.gfp_mask = GFP_USER | __GFP_NOWARN,
			.reason = MR_LONGTERM_PIN,
		};

		/* Any non-zero result from migrate_pages() is reported as -ENOMEM. */
		if (migrate_pages(movable_folio_list, alloc_migration_target,
				  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
				  MR_LONGTERM_PIN, NULL)) {
			ret = -ENOMEM;
			goto err;
		}
	}

	putback_movable_pages(movable_folio_list);

	return -EAGAIN;

err:
	/* Unpin what is left in the array and put the isolated folios back. */
	pofs_unpin(pofs);
	putback_movable_pages(movable_folio_list);

	return ret;
}

/*
 * Collect the folios in @pofs that may not be pinned long term and, if any
 * were found, unpin everything and migrate them.  Returns 0 when nothing
 * had to be collected; otherwise returns whatever
 * migrate_longterm_unpinnable_folios() returns.
 */
static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
	LIST_HEAD(movable_folio_list);
	unsigned long collected;

	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
						       pofs);
	if (!collected)
		return 0;

	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
}

/*
 * Check whether all folios are *allowed* to be pinned indefinitely (long term).
 * Rather confusingly, all folios in the range are required to be pinned via
 * FOLL_PIN, before calling this routine.
 *
 * Return values:
 *
 * 0: if everything is OK and all folios in the range are allowed to be pinned,
 * then this routine leaves all folios pinned and returns zero for success.
 *
 * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
 * routine will migrate those folios away, unpin all the folios in the range. If
 * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
 * caller should re-pin the entire range with FOLL_PIN and then call this
 * routine again.
 *
 * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
 * indicates a migration failure. The caller should give up, and propagate the
 * error back up the call stack. The caller does not need to unpin any folios in
 * that case, because this routine will do the unpinning.
*/ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { struct pages_or_folios pofs = { .folios = folios, .has_folios = true, .nr_entries = nr_folios, }; return check_and_migrate_movable_pages_or_folios(&pofs); } /* * Return values and behavior are the same as those for * check_and_migrate_movable_folios(). */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { struct pages_or_folios pofs = { .pages = pages, .has_folios = false, .nr_entries = nr_pages, }; return check_and_migrate_movable_pages_or_folios(&pofs); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { return 0; } static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { return 0; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int gup_flags) { unsigned int flags; long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; break; } /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } /* * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. 
*/
static bool is_valid_gup_args(struct page **pages, int *locked,
			      unsigned int *gup_flags_p, unsigned int to_set)
{
	/*
	 * Work on a local copy: *gup_flags_p is written back only after every
	 * check below has passed, so a false return leaves it untouched.
	 */
	unsigned int gup_flags = *gup_flags_p;

	/*
	 * These flags are not allowed to be specified externally to the gup
	 * interfaces:
	 * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
	 * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
	 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
	 */
	if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
		return false;

	/* From here on the flags also contain what the entry point requires. */
	gup_flags |= to_set;
	if (locked) {
		/* At the external interface locked must be set */
		if (WARN_ON_ONCE(*locked != 1))
			return false;

		gup_flags |= FOLL_UNLOCKABLE;
	}

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return false;

	/* LONGTERM can only be specified when pinning */
	if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
		return false;

	/* Pages input must be given if using GET/PIN */
	if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
		return false;

	/* We want to allow the pgmap to be hot-unplugged at all times */
	if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
			 (gup_flags & FOLL_PCI_P2PDMA)))
		return false;

	*gup_flags_p = gup_flags;
	return true;
}

#ifdef CONFIG_MMU
/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error.
Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. 
* * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); /* * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. 
*
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *  free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) valid user addresses are below TASK_SIZE_MAX
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GUP_FAST
/*
 * Used in the GUP-fast path to determine whether GUP is permitted to work on
 * a specific folio.
 *
 * This call assumes the caller has pinned the folio, that the lowest page table
 * level still points to this folio, and that interrupts have been disabled.
 *
 * GUP-fast must reject all secretmem folios.
 *
 * Writing to pinned file-backed dirty tracked folios is inherently problematic
 * (see comment describing the writable_file_mapping_allowed() function). We
 * therefore try to avoid the most egregious case of a long-term mapping doing
 * so.
 *
 * This function cannot be as thorough as that one as the VMA is not available
 * in the fast path, so instead we whitelist known good cases and if in doubt,
 * fall back to the slow path.
 *
 * Returns true if GUP-fast may use @folio with @flags, false if the caller
 * has to give up on this folio.
 */
static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
	bool reject_file_backed = false;
	struct address_space *mapping;
	bool check_secretmem = false;
	unsigned long mapping_flags;

	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the one we disallow.
	 */
	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
	    (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
		reject_file_backed = true;

	/* We hold a folio reference, so we can safely access folio fields. */

	/* secretmem folios are always order-0 folios. */
	if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
		check_secretmem = true;

	/* Neither restriction applies: nothing further to inspect. */
	if (!reject_file_backed && !check_secretmem)
		return true;

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return false;

	/* hugetlb neither requires dirty-tracking nor can be secretmem. */
	if (folio_test_hugetlb(folio))
		return true;

	/*
	 * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
	 * cannot proceed, which means no actions performed under RCU can
	 * proceed either.
	 *
	 * inodes and thus their mappings are freed under RCU, which means the
	 * mapping cannot be freed beneath us and thus we can safely dereference
	 * it.
	 */
	lockdep_assert_irqs_disabled();

	/*
	 * However, there may be operations which _alter_ the mapping, so ensure
	 * we read it once and only once.
	 */
	mapping = READ_ONCE(folio->mapping);

	/*
	 * The mapping may have been truncated, in any case we cannot determine
	 * if this mapping is safe - fall back to slow path to determine how to
	 * proceed.
	 */
	if (!mapping)
		return false;

	/*
	 * Anonymous folios pose no problem. If any flag bit is set in the
	 * mapping word, the folio is allowed only when the ANON bit is among
	 * them.
	 */
	mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
	if (mapping_flags)
		return mapping_flags & FOLIO_MAPPING_ANON;

	/*
	 * At this point, we know the mapping is non-null and points to an
	 * address_space object.
	 */
	if (check_secretmem && secretmem_mapping(mapping))
		return false;
	/* The only remaining allowed file system is shmem. */
	return !reject_file_backed || shmem_mapping(mapping);
}

/*
 * Undo the entries recorded in @pages from index @nr_start up to (but not
 * including) *@nr, newest first: clear the referenced flag on each folio and
 * drop one reference taken with @flags. On return *@nr equals @nr_start.
 *
 * NOTE(review): no caller is visible in this part of the file; the
 * __maybe_unused annotation suggests it may be unused in some or all
 * configurations - confirm before relying on it.
 */
static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
		unsigned int flags, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct folio *folio = page_folio(pages[--(*nr)]);

		folio_clear_referenced(folio);
		gup_put_folio(folio, 1, flags);
	}
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * GUP-fast relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, GUP-fast needs to do below in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
* * For the rest of pgtable operations where pgtable updates can be racy * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_special(pte)) goto pte_unmap; /* If it's not marked as special it must have a valid memmap. 
*/ VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_folio_accessible(folio); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. 
*/
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

/*
 * Handle a leaf (huge) pmd entry covering [addr, end).
 *
 * Returns 1 and advances *@nr by the number of subpages recorded, or 0 if the
 * entry cannot be handled here (in which case *@nr is left unchanged and any
 * reference taken has been dropped again).
 */
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pmd_special(orig))
		return 0;

	page = pmd_page(orig);
	/*
	 * refs is used both as the number of references to take and as the
	 * number of entries added to @pages. The entries are handed to
	 * record_subpages() at pages + *nr before the folio is grabbed; they
	 * only count once *nr is advanced below.
	 */
	refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The entry changed under us: drop what we took and bail out. */
	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}
	if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

/*
 * Handle a leaf (huge) pud entry covering [addr, end). Same structure and
 * return convention as gup_fast_pmd_leaf(), one level up.
 */
static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pud_special(orig))
		return 0;

	page = pud_page(orig);
	refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The entry changed under us: drop what we took and bail out. */
	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

/*
 * Walk the pmd entries covering [addr, end) below @pud. Returns 1 if the
 * whole range was handled, 0 at the first entry that is not present or
 * whose handler fails.
 */
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = pmdp_get_lockless(pmdp);

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_leaf(pmd))) {
			/* See gup_fast_pte_range() */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
				pages, nr))
				return 0;

		} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

/*
 * Walk the pud entries covering [addr, end) below @p4d. Returns 1 if the
 * whole range was handled, 0 otherwise.
 */
static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (unlikely(!pud_present(pud)))
			return 0;
		if (unlikely(pud_leaf(pud))) {
			if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
					       pages, nr))
				return 0;
		} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

/*
 * Walk the p4d entries covering [addr, end) below @pgd. Returns 1 if the
 * whole range was handled, 0 otherwise.
 */
static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (!p4d_present(p4d))
			return 0;
		/* Leaf entries at this level are rejected at build time. */
		BUILD_BUG_ON(p4d_leaf(p4d));
		if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
					pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

/*
 * Top of the lockless walk over [addr, end) in current->mm. There is no
 * return value: progress is reported solely through @pages and *@nr, and
 * the walk simply stops at the first entry that cannot be handled.
 */
static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pgd_t *pgdp;

	pgdp = pgd_offset(current->mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			return;
		/* Leaf entries at this level are rejected at build time. */
		BUILD_BUG_ON(pgd_leaf(pgd));
		if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
					pages, nr))
			return;
	} while (pgdp++, addr = next, addr != end);
}
#else
/* Without CONFIG_HAVE_GUP_FAST the walk is a no-op: *@nr is never changed. */
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_GUP_FAST */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version:
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif

/*
 * Lockless attempt to grab the pages of [start, end) into @pages.
 *
 * Returns the number of leading pages obtained, which is 0 when GUP-fast is
 * not available or not permitted for the range, or when a FOLL_PIN attempt
 * has to be abandoned because of the write_protect_seq checks.
 */
static unsigned long gup_fast(unsigned long start, unsigned long end,
		unsigned int gup_flags, struct page **pages)
{
	unsigned long flags;
	int nr_pinned = 0;
	/* Only written and read when FOLL_PIN is set. */
	unsigned seq;

	if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
	    !gup_fast_permitted(start, end))
		return 0;

	if (gup_flags & FOLL_PIN) {
		if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
			return 0;
	}

	/*
	 * Disable interrupts. The nested form is used, in order to allow full,
	 * general purpose use of this routine.
	 *
	 * With interrupts disabled, we block page table pages from being freed
	 * from under us. See struct mmu_table_batch comments in
	 * include/asm-generic/tlb.h for more details.
	 *
	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
	 * that come from callers of tlb_remove_table_sync_one().
	 */
	local_irq_save(flags);
	gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
	local_irq_restore(flags);

	/*
	 * When pinning pages for DMA there could be a concurrent write protect
	 * from fork() via copy_page_range(), in this case always fail GUP-fast.
	 */
	if (gup_flags & FOLL_PIN) {
		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
			/* Give back everything the walk pinned. */
			gup_fast_unpin_user_pages(pages, nr_pinned);
			return 0;
		} else {
			sanity_check_pinned_pages(pages, nr_pinned);
		}
	}
	return nr_pinned;
}

/*
 * Common backend of the *_fast() entry points: try gup_fast() first and,
 * unless FOLL_FAST_ONLY is set, obtain the remaining pages through
 * __gup_longterm_locked().
 *
 * Returns the total number of pages obtained. A negative errno is returned
 * only when nothing was obtained at all (or when the arguments are rejected
 * up front: -EINVAL for unsupported flags, -EOVERFLOW / -EFAULT for a bad
 * range).
 */
static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages)
{
	unsigned long len, end;
	unsigned long nr_pinned;
	int locked = 0;
	int ret;

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY | FOLL_NOFAULT |
				       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)
		mm_set_has_pinned_flag(&current->mm->flags);

	if (!(gup_flags & FOLL_FAST_ONLY))
		might_lock_read(&current->mm->mmap_lock);

	/* Page-align the start and reject ranges that wrap or leave user space. */
	start = untagged_addr(start) & PAGE_MASK;
	len = nr_pages << PAGE_SHIFT;
	if (check_add_overflow(start, len, &end))
		return -EOVERFLOW;
	if (end > TASK_SIZE_MAX)
		return -EFAULT;

	nr_pinned = gup_fast(start, end, gup_flags, pages);
	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
		return nr_pinned;

	/* Slow path: try to get the remaining pages with get_user_pages */
	start += nr_pinned << PAGE_SHIFT;
	pages += nr_pinned;
	ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
				    pages, &locked,
				    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
	if (ret < 0) {
		/*
		 * The caller has to unpin the pages we already pinned so
		 * returning -errno is not an option
		 */
		if (nr_pinned)
			return nr_pinned;
		return ret;
	}
	return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful!
COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. 
*/ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. 
* * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. 
*
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	const unsigned int required = FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE;
	/* Passed down as the lock state: starts out as "not held". */
	int mmap_locked = 0;

	/* Rejected arguments are reported as "0 pages pinned". */
	if (is_valid_gup_args(pages, NULL, &gup_flags, required))
		return __gup_longterm_locked(current->mm, start, nr_pages,
					     pages, &mmap_locked, gup_flags);

	return 0;
}
EXPORT_SYMBOL(pin_user_pages_unlocked);

/**
 * memfd_pin_folios() - pin folios associated with a memfd
 * @memfd:      the memfd whose folios are to be pinned
 * @start:      the first memfd offset
 * @end:        the last memfd offset (inclusive)
 * @folios:     array that receives pointers to the folios pinned
 * @max_folios: maximum number of entries in @folios
 * @offset:     the offset into the first folio
 *
 * Attempt to pin folios associated with a memfd in the contiguous range
 * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
 * the folios can either be found in the page cache or need to be allocated
 * if necessary. Once the folios are located, they are all pinned via
 * FOLL_PIN and @offset is populated with the offset into the first folio.
 * And, eventually, these pinned folios must be released either using
 * unpin_folios() or unpin_folio().
 *
 * It must be noted that the folios may be pinned for an indefinite amount
 * of time. And, in most cases, the duration of time they may stay pinned
 * would be controlled by the userspace. This behavior is effectively the
 * same as using FOLL_LONGTERM with other GUP APIs.
 *
 * Returns number of folios pinned, which could be less than @max_folios
 * as it depends on the folio sizes that cover the range [start, end].
 * If no folios were pinned, it returns -errno.
*/ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset) { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; pgoff_t start_idx, end_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; long ret = -EINVAL; if (start < 0 || start > end || !max_folios) return -EINVAL; if (!memfd) return -EINVAL; if (!shmem_file(memfd) && !is_file_hugepages(memfd)) return -EINVAL; if (end >= i_size_read(file_inode(memfd))) return -EINVAL; if (is_file_hugepages(memfd)) { h = hstate_file(memfd); pgshift = huge_page_shift(h); } flags = memalloc_pin_save(); do { nr_folios = 0; start_idx = start >> pgshift; end_idx = end >> pgshift; if (is_file_hugepages(memfd)) { start_idx <<= huge_page_order(h); end_idx <<= huge_page_order(h); } folio_batch_init(&fbatch); while (start_idx <= end_idx && nr_folios < max_folios) { /* * In most cases, we should be able to find the folios * in the page cache. If we cannot find them for some * reason, we try to allocate them and add them to the * page cache. */ nr_found = filemap_get_folios_contig(memfd->f_mapping, &start_idx, end_idx, &fbatch); if (folio) { folio_put(folio); folio = NULL; } for (i = 0; i < nr_found; i++) { folio = fbatch.folios[i]; if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); ret = -EINVAL; goto err; } if (nr_folios == 0) *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; if (++nr_folios == max_folios) break; } folio = NULL; folio_batch_release(&fbatch); if (!nr_found) { folio = memfd_alloc_folio(memfd, start_idx); if (IS_ERR(folio)) { ret = PTR_ERR(folio); if (ret != -EEXIST) goto err; folio = NULL; } } } ret = check_and_migrate_movable_folios(nr_folios, folios); } while (ret == -EAGAIN); memalloc_pin_restore(flags); return ret ? 
ret : nr_folios; err: memalloc_pin_restore(flags); unpin_folios(folios, nr_folios); return ret; } EXPORT_SYMBOL_GPL(memfd_pin_folios); /** * folio_add_pins() - add pins to an already-pinned folio * @folio: the folio to add more pins to * @pins: number of pins to add * * Try to add more pins to an already-pinned folio. The semantics * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot * be changed. * * This function is helpful when having obtained a pin on a large folio * using memfd_pin_folios(), but wanting to logically unpin parts * (e.g., individual pages) of the folio later, for example, using * unpin_user_page_range_dirty_lock(). * * This is not the right interface to initially pin a folio. */ int folio_add_pins(struct folio *folio, unsigned int pins) { VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio)); return try_grab_folio(folio, pins, FOLL_PIN); } EXPORT_SYMBOL_GPL(folio_add_pins);
192 10934 60 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. 
*
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

#ifdef CONFIG_HAVE_HW_BREAKPOINT
# include <linux/rhashtable-types.h>
# include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>

#include <asm/local.h>

/*
 * Callchain record: a count followed by a flexible array of u64 entries.
 * NOTE(review): @nr presumably counts the valid ip[] slots -- confirm
 * against the callchain code.
 */
struct perf_callchain_entry {
	u64				nr;
	u64				ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Cursor-style wrapper around a struct perf_callchain_entry.
 * NOTE(review): the exact meaning of @nr/@contexts versus @max_stack is
 * not visible here -- check the users before relying on it.
 */
struct perf_callchain_entry_ctx {
	struct perf_callchain_entry	*entry;
	u32				max_stack;
	u32				nr;
	short				contexts;
	bool				contexts_maxed;
};

/* Copy callback type: (dst, src, off, len) -> unsigned long. */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
				     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data. @next and @pad share storage through
 * the anonymous union; the struct is __packed, so field order and sizes
 * are part of its layout.
 */
struct perf_raw_frag {
	union {
		struct perf_raw_frag	*next;
		unsigned long		pad;
	};
	perf_copy_f			copy;
	void				*data;
	u32				size;
} __packed;

/* Raw record: the first fragment (embedded) plus a u32 size. */
struct perf_raw_record {
	struct perf_raw_frag		frag;
	u32				size;
};

/*
 * Returns true when @frag->pad is smaller than sizeof(u64).
 * NOTE(review): since @pad aliases @next, this presumably identifies the
 * final fragment of a chain (a small pad value rather than a pointer) --
 * confirm against the code that fills in the union.
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
	return frag->pad < sizeof(u64);
}

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
*
 * Note that nr can vary from sample to sample
 * branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
	u64				nr;
	u64				hw_idx;
	struct perf_branch_entry	entries[];
};

struct task_struct;

/*
 * extra PMU register associated with an event
 */
struct hw_perf_event_extra {
	u64				config;	/* register value */
	unsigned int			reg;	/* register address or index */
	int				alloc;	/* extra register already allocated */
	int				idx;	/* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flag values
 *
 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
 * usage.
 */
#define PERF_EVENT_FLAG_ARCH			0x0fffffff
#define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000

/* Compile-time check that the generic flag does not overlap the arch bits. */
static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
	/* Per-event-kind state; the anonymous structs below share storage. */
	union {
		struct { /* hardware */
			u64		config;
			u64		config1;
			u64		last_tag;
			u64		dyn_constraint;
			unsigned long	config_base;
			unsigned long	event_base;
			int		event_base_rdpmc;
			int		idx;
			int		last_cpu;
			int		flags;

			struct hw_perf_event_extra extra_reg;
			struct hw_perf_event_extra branch_reg;
		};
		struct { /* aux / Intel-PT */
			u64		aux_config;
			/*
			 * For AUX area events, aux_paused cannot be a state
			 * flag because it can be updated asynchronously to
			 * state.
			 */
			unsigned int	aux_paused;
		};
		struct { /* software */
			struct hrtimer	hrtimer;
		};
		struct { /* tracepoint */
			/* for tp_event->class */
			struct list_head	tp_list;
		};
		struct { /* amd_power */
			u64	pwr_acc;
			u64	ptsc;
		};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
		struct { /* breakpoint */
			/*
			 * Crufty hack to avoid the chicken and egg
			 * problem hw_breakpoint has with context
			 * creation and event initialization.
			 */
			struct arch_hw_breakpoint	info;
			struct rhlist_head		bp_list;
		};
#endif
		struct { /* amd_iommu */
			u8	iommu_bank;
			u8	iommu_cntr;
			u16	padding;
			u64	conf;
			u64	conf1;
		};
	};
	/*
	 * If the event is a per task event, this will point to the task in
	 * question. See the comment in perf_event_alloc().
	 */
	struct task_struct		*target;

	/*
	 * PMU would store hardware filter configuration
	 * here.
	 */
	void				*addr_filters;

	/* Last sync'ed generation of filters */
	unsigned long			addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */

/* the counter is stopped */
#define PERF_HES_STOPPED		0x01

/* event->count up-to-date */
#define PERF_HES_UPTODATE		0x02

#define PERF_HES_ARCH			0x04

	int				state;

	/*
	 * The last observed hardware counter value, updated with a
	 * local64_cmpxchg() such that pmu::read() can be called nested.
	 */
	local64_t			prev_count;

	/*
	 * The period to start the next sample with.
	 */
	u64				sample_period;

	union {
		struct { /* Sampling */
			/*
			 * The period we started this sample with.
			 */
			u64				last_period;

			/*
			 * However much is left of the current period;
			 * note that this is a full 64bit value and
			 * allows for generation of periods longer
			 * than hardware might allow.
			 */
			local64_t			period_left;
		};
		struct { /* Topdown events counting for context switch */
			u64				saved_metric;
			u64				saved_slots;
		};
	};

	/*
	 * State for throttling the event, see __perf_event_overflow() and
	 * perf_adjust_freq_unthr_context().
	 */
	u64				interrupts_seq;
	u64				interrupts;

	/*
	 * State for freq target events, see __perf_event_overflow() and
	 * perf_adjust_freq_unthr_context().
	 */
	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif /* CONFIG_PERF_EVENTS */
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 */

/* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_ADD			0x1

/* txn to read event group from PMU */
#define PERF_PMU_TXN_READ			0x2

/**
 * pmu::capabilities flags
 */
#define PERF_PMU_CAP_NO_INTERRUPT		0x0001
#define PERF_PMU_CAP_NO_NMI			0x0002
#define PERF_PMU_CAP_AUX_NO_SG			0x0004
#define PERF_PMU_CAP_EXTENDED_REGS		0x0008
#define PERF_PMU_CAP_EXCLUSIVE			0x0010
#define PERF_PMU_CAP_ITRACE			0x0020
#define PERF_PMU_CAP_NO_EXCLUDE			0x0040
#define PERF_PMU_CAP_AUX_OUTPUT			0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
#define PERF_PMU_CAP_AUX_PAUSE			0x0200
#define PERF_PMU_CAP_AUX_PREFER_LARGE		0x0400

/**
 * pmu::scope
 */
enum perf_pmu_scope {
	PERF_PMU_SCOPE_NONE	= 0,
	PERF_PMU_SCOPE_CORE,
	PERF_PMU_SCOPE_DIE,
	PERF_PMU_SCOPE_CLUSTER,
	PERF_PMU_SCOPE_PKG,
	PERF_PMU_SCOPE_SYS_WIDE,
	PERF_PMU_MAX_SCOPE,
};

struct perf_output_handle;

/* All-ones pointer constant; NOTE(review): presumably a "no device" marker. */
#define PMU_NULL_DEV	((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
	struct list_head		entry;

	spinlock_t			events_lock;
	struct list_head		events;

	struct module			*module;
	struct device			*dev;
	struct device			*parent;
	const struct attribute_group	**attr_groups;
	const struct attribute_group	**attr_update;
	const char			*name;
	int				type;

	/*
	 * various common per-pmu feature flags
	 */
	int				capabilities;

	/*
	 * PMU scope
	 */
	unsigned int			scope;

	struct perf_cpu_pmu_context * __percpu *cpu_pmu_context;
	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
	int				task_ctx_nr;
	int				hrtimer_interval_ms;

	/* number of address filters this PMU can do */
	unsigned int			nr_addr_filters;

	/*
	 * Fully disable/enable this PMU, can be used to protect from the PMI
	 * as well as for lazy/batch writing of the MSRs.
	 */
	void (*pmu_enable)		(struct pmu *pmu); /* optional */
	void (*pmu_disable)		(struct pmu *pmu); /* optional */

	/*
	 * Try and initialize the event for this PMU.
	 *
	 * Returns:
	 *  -ENOENT	-- @event is not for this PMU
	 *
	 *  -ENODEV	-- @event is for this PMU but PMU not present
	 *  -EBUSY	-- @event is for this PMU but PMU temporarily unavailable
	 *  -EINVAL	-- @event is for this PMU but @event is not valid
	 *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
	 *  -EACCES	-- @event is for this PMU, @event is valid, but no privileges
	 *
	 *  0		-- @event is for this PMU and valid
	 *
	 * Other error return values are allowed.
	 */
	int (*event_init)		(struct perf_event *event);

	/*
	 * Notification that the event was mapped or unmapped.  Called
	 * in the context of the mapping task.
	 */
	void (*event_mapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */
	void (*event_unmapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */

	/*
	 * Flags for ->add()/->del()/ ->start()/->stop(). There are
	 * matching hw_perf_event::state flags.
	 */

/* start the counter when adding */
#define PERF_EF_START			0x01

/* reload the counter when starting */
#define PERF_EF_RELOAD			0x02

/* update the counter when stopping */
#define PERF_EF_UPDATE			0x04

/* AUX area event, pause tracing */
#define PERF_EF_PAUSE			0x08

/* AUX area event, resume tracing */
#define PERF_EF_RESUME			0x10

	/*
	 * Adds/Removes a counter to/from the PMU, can be done inside a
	 * transaction, see the ->*_txn() methods.
	 *
	 * The add/del callbacks will reserve all hardware resources required
	 * to service the event, this includes any counter constraint
	 * scheduling etc.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on.
	 *
	 * ->add() called without PERF_EF_START should result in the same state
	 *  as ->add() followed by ->stop().
	 *
	 * ->del() must always PERF_EF_UPDATE stop an event. If it calls
	 *  ->stop() that must deal with already being stopped without
	 *  PERF_EF_UPDATE.
	 */
	int  (*add)			(struct perf_event *event, int flags);
	void (*del)			(struct perf_event *event, int flags);

	/*
	 * Starts/Stops a counter present on the PMU.
	 *
	 * The PMI handler should stop the counter when perf_event_overflow()
	 * returns !0. ->start() will be used to continue.
	 *
	 * Also used to change the sample period.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on -- will be called from NMI context when the PMU generates
	 * NMIs.
	 *
	 * ->stop() with PERF_EF_UPDATE will read the counter and update
	 *  period/count values like ->read() would.
	 *
	 * ->start() with PERF_EF_RELOAD will reprogram the counter
	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
	 *
	 * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
	 * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
	 * PERF_EF_RESUME.
	 *
	 * ->start() with PERF_EF_RESUME will start as simply as possible but
	 * only if the counter is not otherwise stopped. Will not overlap
	 * another ->start() with PERF_EF_RESUME nor ->stop() with
	 * PERF_EF_PAUSE.
	 *
	 * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
	 * ->stop()/->start() invocations, just not itself.
	 */
	void (*start)			(struct perf_event *event, int flags);
	void (*stop)			(struct perf_event *event, int flags);

	/*
	 * Updates the counter value of the event.
	 *
	 * For sampling capable PMUs this will also update the software period
	 * hw_perf_event::period_left field.
	 */
	void (*read)			(struct perf_event *event);

	/*
	 * Group events scheduling is treated as a transaction, add
	 * group events as a whole and perform one schedulability test.
	 * If the test fails, roll back the whole group
	 *
	 * Start the transaction, after this ->add() doesn't need to
	 * do schedulability tests.
	 *
	 * Optional.
	 */
	void (*start_txn)		(struct pmu *pmu, unsigned int txn_flags);
	/*
	 * If ->start_txn() disabled the ->add() schedulability test
	 * then ->commit_txn() is required to perform one. On success
	 * the transaction is closed. On error the transaction is kept
	 * open until ->cancel_txn() is called.
	 *
	 * Optional.
	 */
	int  (*commit_txn)		(struct pmu *pmu);
	/*
	 * Will cancel the transaction, assumes ->del() is called
	 * for each successful ->add() during the transaction.
	 *
	 * Optional.
	 */
	void (*cancel_txn)		(struct pmu *pmu);

	/*
	 * Will return the value for perf_event_mmap_page::index for this event,
	 * if no implementation is provided it will default to 0 (see
	 * perf_event_idx_default).
	 */
	int (*event_idx)		(struct perf_event *event); /* optional */

	/*
	 * context-switches callback
	 */
	void (*sched_task)		(struct perf_event_pmu_context *pmu_ctx,
					 struct task_struct *task, bool sched_in);

	/*
	 * Kmem cache of PMU specific data
	 */
	struct kmem_cache		*task_ctx_cache;

	/*
	 * Set up pmu-private data structures for an AUX area
	 */
	void *(*setup_aux)		(struct perf_event *event, void **pages,
					 int nr_pages, bool overwrite);
					/* optional */

	/*
	 * Free pmu-private AUX data structures
	 */
	void (*free_aux)		(void *aux); /* optional */

	/*
	 * Take a snapshot of the AUX buffer without touching the event
	 * state, so that preempting ->start()/->stop() callbacks does
	 * not interfere with their logic. Called in PMI context.
	 *
	 * Returns the size of AUX data copied to the output handle.
	 *
	 * Optional.
	 */
	long (*snapshot_aux)		(struct perf_event *event,
					 struct perf_output_handle *handle,
					 unsigned long size);

	/*
	 * Validate address range filters: make sure the HW supports the
	 * requested configuration and number of filters; return 0 if the
	 * supplied filters are valid, -errno otherwise.
	 *
	 * Runs in the context of the ioctl()ing process and is not serialized
	 * with the rest of the PMU callbacks.
	 */
	int (*addr_filters_validate)	(struct list_head *filters);
					/* optional */

	/*
	 * Synchronize address range filter configuration:
	 * translate hw-agnostic filters into hardware configuration in
	 * event::hw::addr_filters.
	 *
	 * Runs as a part of filter sync sequence that is done in ->start()
	 * callback by calling perf_event_addr_filters_sync().
	 *
	 * May (and should) traverse event::addr_filters::list, for which its
	 * caller provides necessary serialization.
	 */
	void (*addr_filters_sync)	(struct perf_event *event);
					/* optional */

	/*
	 * Check if event can be used for aux_output purposes for
	 * events of this PMU.
	 *
	 * Runs from perf_event_open(). Should return 0 for "no match"
	 * or non-zero for "match".
	 */
	int (*aux_output_match)		(struct perf_event *event);
					/* optional */

	/*
	 * Skip programming this PMU on the given CPU. Typically needed for
	 * big.LITTLE things.
	 */
	bool (*filter)			(struct pmu *pmu, int cpu); /* optional */

	/*
	 * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
	 */
	int (*check_period)		(struct perf_event *event, u64 value); /* optional */
};

/* Action selector stored in perf_addr_filter::action. */
enum perf_addr_filter_action_t {
	PERF_ADDR_FILTER_ACTION_STOP = 0,
	PERF_ADDR_FILTER_ACTION_START,
	PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:	event's filter list linkage
 * @path:	object file's path for file-based filters
 * @offset:	filter range offset
 * @size:	filter range size (size==0 means single address trigger)
 * @action:	filter/start/stop
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
	struct list_head	entry;
	struct path		path;
	unsigned long		offset;
	unsigned long		size;
	enum perf_addr_filter_action_t	action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:	list of filters for this event
 * @lock:	spinlock that serializes accesses to the @list and event's
 *		(and its children's) filter generations.
 * @nr_file_filters:	number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
	struct list_head	list;
	raw_spinlock_t		lock;
	unsigned int		nr_file_filters;
};

/*
 * A (start, size) pair describing one address range.
 * NOTE(review): presumably the resolved form of a perf_addr_filter --
 * confirm against the users of perf_event::addr_filter_ranges.
 */
struct perf_addr_filter_range {
	unsigned long		start;
	unsigned long		size;
};

/*
 * The normal states are:
 *
 *            ACTIVE    --.
*                ^        |
 *               |        |
 *       sched_{in,out}() |
 *               |        |
 *               v        |
 *      ,---> INACTIVE  --+ <-.
 *      |                 |   |
 *      |                {dis,en}able()
 *   sched_in()           |   |
 *      |       OFF    <--' --+
 *      |                     |
 *      `--->  ERROR    ------'
 *
 * That is:
 *
 * sched_in:       INACTIVE          -> {ACTIVE,ERROR}
 * sched_out:      ACTIVE            -> INACTIVE
 * disable:        {ACTIVE,INACTIVE} -> OFF
 * enable:         {OFF,ERROR}       -> INACTIVE
 *
 * Where {OFF,ERROR} are disabled states.
 *
 * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
 * defunct events:
 *
 *  - EXIT means task that the even was assigned to died, but child events
 *    still live, and further children can still be created. But the event
 *    itself will never be active again. It can only transition to
 *    {REVOKED,DEAD};
 *
 *  - REVOKED means the PMU the event was associated with is gone; all
 *    functionality is stopped but the event is still alive. Can only
 *    transition to DEAD;
 *
 *  - DEAD event really is DYING tearing down state and freeing bits.
 *
 */
/*
 * Numeric layout: the disabled and defunct states (OFF and below) are
 * negative, INACTIVE is 0 and ACTIVE is 1.
 */
enum perf_event_state {
	PERF_EVENT_STATE_DEAD		= -5,
	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	= 0,
	PERF_EVENT_STATE_ACTIVE		= 1,
};

struct file;
struct perf_sample_data;

/*
 * Callback type stored in perf_event::overflow_handler; receives the
 * event, the sample data and a struct pt_regs pointer.
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
					struct perf_sample_data *,
					struct pt_regs *regs);

/*
 * Event capabilities. For event_caps and groups caps.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
 * PMU scope where it is active.
*/
#define PERF_EV_CAP_SOFTWARE		BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
#define PERF_EV_CAP_SIBLING		BIT(2)
#define PERF_EV_CAP_READ_SCOPE		BIT(3)

/* Bucket count of swevent_hlist::heads is 1 << SWEVENT_HLIST_BITS (256). */
#define SWEVENT_HLIST_BITS		8
#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
	struct rcu_head			rcu_head;
};

/* Single-bit flag values; NOTE(review): presumably for perf_event::attach_state. */
#define PERF_ATTACH_CONTEXT		0x0001
#define PERF_ATTACH_GROUP		0x0002
#define PERF_ATTACH_TASK		0x0004
#define PERF_ATTACH_TASK_DATA		0x0008
#define PERF_ATTACH_GLOBAL_DATA		0x0010
#define PERF_ATTACH_SCHED_CB		0x0020
#define PERF_ATTACH_CHILD		0x0040
#define PERF_ATTACH_EXCLUSIVE		0x0080
#define PERF_ATTACH_CALLCHAIN		0x0100
#define PERF_ATTACH_ITRACE		0x0200

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

/* A list head paired with the raw spinlock declared next to it. */
struct pmu_event_list {
	raw_spinlock_t			lock;
	struct list_head		list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 */
#ifdef CONFIG_PROVE_LOCKING
# define lockdep_assert_event_ctx(event)				\
	WARN_ON_ONCE(__lockdep_enabled &&			\
		     (this_cpu_read(hardirqs_enabled) &&		\
		      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
# define lockdep_assert_event_ctx(event)
#endif

/*
 * Iterate over the siblings of @event; the loop body runs only when @event
 * is its own group leader.
 *
 * Note: this expands to a statement followed by an if/loop header, so it
 * must not be used as the sole body of an unbraced if/else/for/while.
 */
#define for_each_sibling_event(sibling, event)			\
	lockdep_assert_event_ctx(event);				\
	if ((event)->group_leader == (event))			\
		list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
	/*
	 * entry onto perf_event_context::event_list;
	 *   modifications require ctx->lock
	 *   RCU safe iterations.
	 */
	struct list_head		event_entry;

	/*
	 * Locked for modification by both ctx->mutex and ctx->lock; holding
	 * either suffices for read.
	 */
	struct list_head		sibling_list;
	struct list_head		active_list;
	/*
	 * Node on the pinned or flexible tree located at the event context;
	 */
	struct rb_node			group_node;
	u64				group_index;
	/*
	 * We need storage to track the entries in perf_pmu_migrate_context; we
	 * cannot use the event_entry because of RCU and we want to keep the
	 * group intact which avoids us using the other two entries.
	 */
	struct list_head		migrate_entry;

	struct hlist_node		hlist_entry;
	struct list_head		active_entry;
	int				nr_siblings;

	/* Not serialized. Only written during event initialization. */
	int				event_caps;
	/* The cumulative AND of all event_caps for events in this group. */
	int				group_caps;

	unsigned int			group_generation;
	struct perf_event		*group_leader;
	/*
	 * event->pmu will always point to pmu in which this event belongs.
	 * Whereas event->pmu_ctx->pmu may point to other pmu when group of
	 * different pmu events is created.
	 */
	struct pmu			*pmu;
	void				*pmu_private;

	enum perf_event_state		state;
	unsigned int			attach_state;
	local64_t			count;
	atomic64_t			child_count;

	/*
	 * These are the total time in nanoseconds that the event
	 * has been enabled (i.e. eligible to run, and the task has
	 * been scheduled in, if this is a per-task event)
	 * and running (scheduled onto the CPU), respectively.
	 */
	u64				total_time_enabled;
	u64				total_time_running;
	u64				tstamp;

	struct perf_event_attr		attr;
	u16				header_size;
	u16				id_header_size;
	u16				read_size;
	struct hw_perf_event		hw;

	struct perf_event_context	*ctx;
	/*
	 * event->pmu_ctx points to perf_event_pmu_context in which the event
	 * is added. This pmu_ctx can be of other pmu for sw event when that
	 * sw event is part of a group which also contains non-sw events.
	 */
	struct perf_event_pmu_context	*pmu_ctx;
	atomic_long_t			refcount;

	/*
	 * These accumulate total time (in nanoseconds) that children
	 * events have been enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * Protect attach/detach and child_list:
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_event		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap bits */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;

	struct perf_buffer		*rb;
	struct list_head		rb_entry;
	unsigned long			rcu_batches;
	int				rcu_pending;

	/* poll related */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* delayed work for NMIs and such */
	unsigned int			pending_wakeup;
	unsigned int			pending_kill;
	unsigned int			pending_disable;
	unsigned long			pending_addr;	/* SIGTRAP */
	struct irq_work			pending_irq;
	struct irq_work			pending_disable_irq;
	struct callback_head		pending_task;
	unsigned int			pending_work;

	atomic_t			event_limit;

	/* address range filters */
	struct perf_addr_filters_head	addr_filters;
	/* vma address array for file-based filters */
	struct perf_addr_filter_range	*addr_filter_ranges;
	unsigned long			addr_filters_gen;

	/* for aux_output events */
	struct perf_event		*aux_event;

	void (*destroy)(struct perf_event *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;

	atomic64_t			lost_samples;

	u64				(*clock)(void);
	perf_overflow_handler_t		overflow_handler;
	void				*overflow_handler_context;
	struct bpf_prog			*prog;
	u64				bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
	struct trace_event_call		*tp_event;
	struct event_filter		*filter;
# ifdef CONFIG_FUNCTION_TRACER
	struct ftrace_ops               ftrace_ops;
# endif
#endif

#ifdef CONFIG_CGROUP_PERF
	struct perf_cgroup		*cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
	void *security;
#endif
	struct list_head		sb_list;
	struct list_head		pmu_list;

	/*
	 * Certain events get forwarded to another pmu internally by over-
	 * writing kernel copy of event->attr.type without user being aware
	 * of it. event->orig_type contains original 'type' requested by
	 * user.
	 */
	u32				orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                    V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
	struct pmu			*pmu;
	struct perf_event_context	*ctx;

	struct list_head		pmu_ctx_entry;

	struct list_head		pinned_active;
	struct list_head		flexible_active;

	/* Used to identify the per-cpu perf_event_pmu_context */
	unsigned int			embedded : 1;

	unsigned int			nr_events;
	unsigned int			nr_cgroups;
	unsigned int			nr_freq;

	atomic_t			refcount; /* event <-> epc */
	struct rcu_head			rcu_head;

	/*
	 * Set when one or more (plausibly active) event can't be scheduled
	 * due to pmu overcommit or pmu constraints, except tolerant to
	 * events not necessary to be active due to scheduling constraints,
	 * such as cgroups.
*/ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. 
*/ local_t nr_no_switch_fast; }; /** * struct perf_ctx_data - PMU specific data for a task * @rcu_head: To avoid the race on free PMU specific data * @refcount: To track users * @global: To track system-wide users * @ctx_cache: Kmem cache of PMU specific data * @data: PMU specific data * * Currently, the struct is only used in Intel LBR call stack mode to * save/restore the call stack of a task on context switches. * * The rcu_head is used to prevent the race on free the data. * The data only be allocated when Intel LBR call stack mode is enabled. * The data will be freed when the mode is disabled. * The content of the data will only be accessed in context switch, which * should be protected by rcu_read_lock(). * * Because of the alignment requirement of Intel Arch LBR, the Kmem cache * is used to allocate the PMU specific data. The ctx_cache is to track * the Kmem cache. * * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. * When system-wide Intel LBR call stack mode is enabled, a buffer with * constant size will be allocated for each task. * Also, system memory consumption can further grow when the size of * struct perf_ctx_data enlarges. */ struct perf_ctx_data { struct rcu_head rcu_head; refcount_t refcount; int global; struct kmem_cache *ctx_cache; void *data; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. 
The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; union { u64 flags; /* perf_output*() */ u64 aux_flags; /* perf_aux_output*() */ struct { u64 skip_read : 1; }; }; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? 
lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern 
struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); extern int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. 
*/ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) return; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) return; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = 
round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (!has_branch_stack(event)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) return; if (branch_sample_hw_index(event)) size += sizeof(u64); brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. */ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. 
*/
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
	br->mispred	= 0;
	br->predicted	= 0;
	br->in_tx	= 0;
	br->abort	= 0;
	br->cycles	= 0;
	br->type	= 0;
	/* Unlike the other bitfields, 'spec' is reset to PERF_BR_SPEC_NA rather than 0. */
	br->spec	= PERF_BR_SPEC_NA;
	br->reserved	= 0;
}

extern void perf_output_sample(struct perf_output_handle *handle,
			       struct perf_event_header *header,
			       struct perf_sample_data *data,
			       struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
				struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
			       struct perf_sample_data *data,
			       struct pt_regs *regs);

extern void perf_event_output_forward(struct perf_event *event,
				      struct perf_sample_data *data,
				      struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
				       struct perf_sample_data *data,
				       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
			     struct perf_sample_data *data,
			     struct pt_regs *regs);

/*
 * Return true if the event's overflow handler is perf_event_output_forward()
 * or perf_event_output_backward(). The forward handler is annotated as the
 * likely case.
 */
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
	perf_overflow_handler_t overflow_handler = event->overflow_handler;

	if (likely(overflow_handler == perf_event_output_forward))
		return true;
	if (unlikely(overflow_handler == perf_event_output_backward))
		return true;
	return false;
}

extern void
perf_event_header__init_id(struct perf_event_header *header,
			   struct perf_sample_data *data,
			   struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
			     struct perf_output_handle *handle,
			     struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

/* Return true if any of the attr->exclude_* bits tested below is set for @event. */
static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;

	return attr->exclude_idle || attr->exclude_user ||
	       attr->exclude_kernel || attr->exclude_hv ||
	       attr->exclude_guest || attr->exclude_host;
}

/* An event counts as a sampling event when attr.sample_period is non-zero. */
static inline bool is_sampling_event(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}

/*
 * Return 1 for a software event, 0 for a hardware event
 */
static inline int is_software_event(struct perf_event *event)
{
	return event->event_caps & PERF_EV_CAP_SOFTWARE;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
	return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

/* Non-zero when @pmu advertises PERF_PMU_CAP_EXCLUSIVE in its capabilities. */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
	return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/* Empty fallback, used only when perf_arch_fetch_caller_regs is not already defined. */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
*/
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Forward to __perf_sw_event() only when the static key for @event_id is
 * enabled.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
	if (static_key_false(&perf_swevent_enabled[event_id]))
		__perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
	/* Uses slot 0 of this CPU's __perf_regs array as scratch pt_regs. */
	struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

	perf_fetch_caller_regs(regs);
	___perf_sw_event(event_id, nr, regs, addr);
}

extern struct static_key_false perf_sched_events;

/* Test the static key of software event @swevt. */
static __always_inline bool __perf_sw_enabled(int swevt)
{
	return static_key_false(&perf_swevent_enabled[swevt]);
}

/*
 * Mark @task as migrated when CPU-migration software events are enabled;
 * the flag is consumed in perf_event_task_sched_in().
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
		task->sched_migrated = 1;
}

/*
 * Schedule-in hook: calls __perf_event_task_sched_in() when
 * perf_sched_events is enabled, then emits a CPU_MIGRATIONS software event
 * and clears task->sched_migrated if the task was flagged as migrated.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
					    struct task_struct *task)
{
	if (static_branch_unlikely(&perf_sched_events))
		__perf_event_task_sched_in(prev, task);

	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
	    task->sched_migrated) {
		__perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
		task->sched_migrated = 0;
	}
}

/*
 * Schedule-out hook: emits a CONTEXT_SWITCHES software event if enabled,
 * a CGROUP_SWITCHES event if enabled and @prev and @next map to different
 * perf cgroups (CONFIG_CGROUP_PERF only), and finally calls
 * __perf_event_task_sched_out() when perf_sched_events is enabled.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
					     struct task_struct *next)
{
	if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
		__perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
	if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
	    perf_cgroup_from_task(prev, NULL) !=
	    perf_cgroup_from_task(next, NULL))
		__perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
#endif

	if (static_branch_unlikely(&perf_sched_events))
		__perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
			       bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
				 enum perf_bpf_event_type type,
				 u16 flags);

/* Bits tested against the value returned by perf_guest_state(). */
#define PERF_GUEST_ACTIVE	0x01
#define PERF_GUEST_USER		0x02

struct perf_guest_info_callbacks {
	unsigned int			(*state)(void);
	unsigned long			(*get_ip)(void);
	unsigned int			(*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_GUEST_PERF_EVENTS

extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/* Thin wrappers that invoke the corresponding static call. */
static inline unsigned int perf_guest_state(void)
{
	return static_call(__perf_guest_state)();
}

static inline unsigned long perf_guest_get_ip(void)
{
	return static_call(__perf_guest_get_ip)();
}

static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
	return static_call(__perf_guest_handle_intel_pt_intr)();
}

extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);

#else /* !CONFIG_GUEST_PERF_EVENTS: */

/* Without guest perf event support the guest helpers all return 0. */
static inline unsigned int perf_guest_state(void)		 { return 0; }
static inline unsigned long perf_guest_get_ip(void)		 { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }

#endif /* !CONFIG_GUEST_PERF_EVENTS */

extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
				 const void *old_bytes, size_t old_len,
				 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
		   u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append a context marker @ip to the callchain. Returns 0 on success.
 * Once sysctl_perf_event_max_contexts_per_stack markers have been stored,
 * sets ctx->contexts_maxed and returns -1 instead.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
	if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
		struct perf_callchain_entry *entry = ctx->entry;

		entry->ip[entry->nr++] = ip;
		++ctx->contexts;
		return 0;
	} else {
		ctx->contexts_maxed = true;
		return -1; /* no more room, stop walking the stack */
	}
}

/*
 * Append @ip to the callchain. Returns 0 on success, or -1 when
 * ctx->max_stack entries have been stored or contexts_maxed is set.
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
	if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
		struct perf_callchain_entry *entry = ctx->entry;

		entry->ip[entry->nr++] = ip;
		++ctx->nr;
		return 0;
	} else {
		return -1; /* no more room, stop walking the stack */
	}
}

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_sample_rate;

extern void perf_sample_event_took(u64 sample_len_ns);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN		0

/* Finer grained perf_event_open(2) access control.
*/
#define PERF_SECURITY_CPU		1
#define PERF_SECURITY_KERNEL		2
#define PERF_SECURITY_TRACEPOINT	3

/* Non-zero when sysctl_perf_event_paranoid is greater than -1. */
static inline int perf_is_paranoid(void)
{
	return sysctl_perf_event_paranoid > -1;
}

extern int perf_allow_kernel(void);

/*
 * Returns -EACCES when sysctl_perf_event_paranoid > 0 and the caller is not
 * perfmon_capable(); otherwise returns the result of
 * security_perf_event_open(PERF_SECURITY_CPU).
 */
static inline int perf_allow_cpu(void)
{
	if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
		return -EACCES;

	return security_perf_event_open(PERF_SECURITY_CPU);
}

/*
 * Returns -EPERM when sysctl_perf_event_paranoid > -1 and the caller is not
 * perfmon_capable(); otherwise returns the result of
 * security_perf_event_open(PERF_SECURITY_TRACEPOINT).
 */
static inline int perf_allow_tracepoint(void)
{
	if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
		return -EPERM;

	return security_perf_event_open(PERF_SECURITY_TRACEPOINT);
}

extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);

extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
			  int entry_size, struct pt_regs *regs,
			  struct hlist_head *head, int rctx,
			  struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs);
extern unsigned long perf_instruction_pointer(struct perf_event *event,
					      struct pt_regs *regs);

/* Fallbacks, used only when perf_arch_misc_flags is not already defined. */
#ifndef perf_arch_misc_flags
# define perf_arch_misc_flags(regs) \
		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_arch_instruction_pointer(regs)	instruction_pointer(regs)
#endif
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

#ifndef perf_arch_guest_misc_flags
/*
 * Fallback guest misc flags: 0 when perf_guest_state() does not report
 * PERF_GUEST_ACTIVE, otherwise PERF_RECORD_MISC_GUEST_USER or
 * PERF_RECORD_MISC_GUEST_KERNEL depending on PERF_GUEST_USER.
 * @regs is not used by this fallback.
 */
static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
{
	unsigned long guest_state = perf_guest_state();

	if (!(guest_state & PERF_GUEST_ACTIVE))
		return 0;

	if (guest_state & PERF_GUEST_USER)
		return PERF_RECORD_MISC_GUEST_USER;
	else
		return PERF_RECORD_MISC_GUEST_KERNEL;
}
# define perf_arch_guest_misc_flags(regs)	perf_arch_guest_misc_flags(regs)
#endif

/* True when attr.branch_sample_type is non-zero. */
static inline bool needs_branch_stack(struct perf_event *event)
{
	return event->attr.branch_sample_type != 0;
}

/* True when the event has a pmu and that pmu provides a setup_aux callback. */
static inline bool has_aux(struct perf_event *event)
{
	return event->pmu && event->pmu->setup_aux;
}

/* True when any of attr.aux_sample_size, attr.aux_pause or attr.aux_resume is set. */
static inline bool has_aux_action(struct perf_event *event)
{
	return event->attr.aux_sample_size ||
	       event->attr.aux_pause ||
	       event->attr.aux_resume;
}

/* True when attr.write_backward is set. */
static inline bool is_write_backward(struct perf_event *event)
{
	return !!event->attr.write_backward;
}

/* True when the event's pmu has a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
	return event->pmu->nr_addr_filters;
}

/*
 * An inherited event uses parent's filters
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
	struct perf_addr_filters_head *ifh = &event->addr_filters;

	if (event->parent)
		ifh = &event->parent->addr_filters;

	return ifh;
}

/* Return the address of the fasync pointer, taken from the parent when there is one. */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
	/* Only the parent has fasync state */
	if (event->parent)
		event = event->parent;
	return &event->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

extern int perf_output_begin(struct perf_output_handle *handle,
			     struct perf_sample_data *data,
			     struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
				     struct perf_sample_data *data,
				     struct perf_event *event,
				     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
				      struct perf_sample_data *data,
				      struct perf_event *event,
				      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
				     const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
				     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
				 struct perf_output_handle *handle,
				 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);

#else /* !CONFIG_PERF_EVENTS: */

/*
 * Stubs used when CONFIG_PERF_EVENTS is not set: each is an empty function
 * or returns a fixed value (NULL, 0, -1, -EINVAL or ERR_PTR(-EINVAL)).
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event)				{ return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
									{ }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
		     unsigned long size)				{ return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
static inline void
perf_event_task_migrate(struct task_struct *task)			{ }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
			 struct task_struct *task)			{ }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
			  struct task_struct *next)			{ }
static inline int perf_event_init_task(struct task_struct *child,
				       u64 clone_flags)			{ return 0; }
static inline void perf_event_exit_task(struct task_struct *child)	{ }
static inline void perf_event_free_task(struct task_struct *task)	{ }
static inline void perf_event_delayed_put(struct task_struct *task)	{ }
static inline struct file *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
	return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
					u64 *enabled, u64 *running)
{
	return -EINVAL;
}
static inline void perf_event_print_debug(void)				{ }
static inline int perf_event_task_disable(void)				{ return -EINVAL; }
static inline int perf_event_task_enable(void)				{ return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
	return -EINVAL;
}

static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
static inline void
perf_bp_event(struct perf_event *event, void *data)			{ }

static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }

typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
				      bool unregister, const char *sym)	{ }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
					enum perf_bpf_event_type type,
					u16 flags)			{ }
static inline void perf_event_exec(void)				{ }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
static inline void perf_event_fork(struct task_struct *tsk)		{ }
static inline void perf_event_text_poke(const void *addr,
					const void *old_bytes,
					size_t old_len,
					const void *new_bytes,
					size_t new_len)			{ }
static inline void perf_event_init(void)				{ }
static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)		{ }
static inline u64 perf_swevent_set_period(struct perf_event *event)	{ return 0; }
static inline void perf_event_enable(struct perf_event *event)		{ }
static inline void perf_event_disable(struct perf_event *event)		{ }
static inline int __perf_event_disable(void *info)			{ return -1; }
static inline void perf_event_task_tick(void)				{ }
static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
	return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
	return 0;
}
static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
	return 0;
}

#endif /* !CONFIG_PERF_EVENTS */

/* Declared only with both CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL; otherwise an empty stub. */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)			{ }
#endif

/* Copy the object @x itself (sizeof(x) bytes) via perf_output_copy(); @x must be an lvalue. */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/* Device attribute carrying an event id and an optional event string. */
struct perf_pmu_events_attr {
	struct device_attribute		attr;
	u64				id;
	const char			*event_str;
};

/* Like perf_pmu_events_attr, but with separate 'ht' and 'noht' event strings. */
struct perf_pmu_events_ht_attr {
	struct device_attribute		attr;
	u64				id;
	const char			*event_str_ht;
	const char			*event_str_noht;
};

/* Like perf_pmu_events_attr, with an additional pmu_type field. */
struct perf_pmu_events_hybrid_attr {
	struct device_attribute		attr;
	u64				id;
	const char			*event_str;
	u64				pmu_type;
};

/* Device attribute paired with a pmu_type field. */
struct perf_pmu_format_hybrid_attr {
	struct device_attribute		attr;
	u64				pmu_type;
};

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page);

/*
 * Define a static, read-only (0444) perf_pmu_events_attr named @_var whose
 * sysfs attribute is @_name, shown by @_show, with id @_id.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)				\
static struct perf_pmu_events_attr _var = {				\
	.attr = __ATTR(_name, 0444, _show, NULL),			\
	.id   =  _id,							\
};

#define PMU_EVENT_ATTR_STRING(_name, _var, _str)			    \
static struct perf_pmu_events_attr _var = {				    \
	.attr		= __ATTR(_name, 0444,
perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS extern int perf_event_init_cpu(unsigned int cpu); extern int perf_event_exit_cpu(unsigned int cpu); #else # define perf_event_init_cpu NULL # define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */
117 10 115 105 10 1 10 8 401 10 91 127 2 336 2 313 2 2 216 2 3 208 248 122 26 265 457 288 4 276 433 7 23 22 3 1 22 20 20 22 3 55 3 15 2 34 4 43 6 37 12 2 9 8 2 2 29 10 3 1 3 4 1 7 293 2 6 85 71 5 15 92 391 260 118 8 2 2 2 28 141 389 9 12 371 12 13 11 2 4 2 4 12 2 5 7 12 8 20 20 12 19 89 83 9 17 13 4 13 1 5 17 17 14 1 17 28 14 45 1 18 1 25 27 45 96 94 86 1 13 1 66 34 53 66 48 8 16 16 16 7 9 63 15 46 46 1 38 5 34 11 11 11 78 83 70 94 28 5 20 26 18 26 26 26 28 28 28 4 32 4 33 12 44 1 43 12 36 4 33 5 2 26 231 164 151 158 163 6 229 1 156 155 139 41 127 84 227 110 80 20 80 7 35 107 159 35 139 33 51 3 129 33 61 112 174 132 165 138 1 74 138 122 37 27 10 68 12 6 55 8 10 2 112 117 64 126 1 1 7 7 8 110 54 14 1 84 14 95 116 2 116 112 16 130 16 1 1 2 134 1 134 138 347 42 52 364 227 35 180 28 21 7 32 11 11 168 21 24 9 9 6 114 114 101 19 114 1 26 2 99 2 9 7 24 23 81 24 3 39 76 1 1 2 68 4 5 78 11 113 113 114 101 102 70 2 2 59 8 28 22 8 27 59 63 6 58 59 27 69 69 15 49 68 70 74 30 66 10 68 29 73 69 24 97 33 97 10 2 17 17 74 85 67 28 13 15 73 31 16 1 175 176 131 53 146 5 4 28 25 6 8 23 176 176 188 187 16 109 161 10 1 2 10 14 112 10 2 136 112 2 129 5 383 59 24 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 
737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 
1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 
1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 
1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 
2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 
2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 
3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> * * Fix by Harrison Xing <harrison@mountainviewdata.com>. * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. * Extended attributes for symlinks and special files added per * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz * and Andreas Gruenbacher <agruen@suse.de>. */ /* * Extended attributes are stored directly in inodes (on file systems with * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl * field contains the block number if an inode uses an additional block. All * attributes must fit in the inode and one additional block. Blocks that * contain the identical set of attributes may be shared among several inodes. * Identical blocks are detected by keeping a cache of blocks that have * recently been accessed. * * The attributes in inodes and on blocks have a different header; the entries * are stored in the same format: * * +------------------+ * | header | * | entry 1 | | * | entry 2 | | growing downwards * | entry 3 | v * | four null bytes | * | . . . | * | value 1 | ^ * | value 3 | | growing upwards * | value 2 | | * +------------------+ * * The header is followed by multiple entry descriptors. In disk blocks, the * entry descriptors are kept sorted. In inodes, they are unsorted. The * attribute values are aligned to the end of the block in no specific order. * * Locking strategy * ---------------- * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. 
* EA blocks are only changed if they are exclusive to an inode, so * holding xattr_sem also means that nothing but the EA block's reference * count can change. Multiple writers to the same block are synchronized * by the buffer lock. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, 
#ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(csum, (__u8 *)hdr, offset); csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const char *ext4_xattr_prefix(int name_index, struct dentry *dentry) { const struct xattr_handler *handler 
= NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } static int check_xattrs(struct inode *inode, struct buffer_head *bh, struct ext4_xattr_entry *entry, void *end, void *value_start, const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; int err = -EFSCORRUPTED; char *err_str; if (bh) { if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { err_str = "invalid header"; goto errout; } if (buffer_verified(bh)) return 0; if (!ext4_xattr_block_csum_verify(inode, bh)) { err = -EFSBADCRC; err_str = "invalid checksum"; goto errout; } } else { struct ext4_xattr_ibody_header *header = value_start; header -= 1; if (end - (void *)header < sizeof(*header) + sizeof(u32)) { err_str = "in-inode xattr block too small"; goto errout; } if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { err_str = "bad magic number in in-inode xattr"; goto errout; } } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) { err_str = "e_name out of bounds"; goto errout; } if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { err_str = "bad e_name length"; goto errout; } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { err_str = "ea_inode specified without ea_inode feature enabled"; goto errout; } if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || !ext4_valid_inum(inode->i_sb, ea_ino))) { err_str = "invalid ea_ino"; goto errout; } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; 
/* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ if (offs > end - value_start) { err_str = "e_value out of bounds"; goto errout; } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) { err_str = "overlapping e_value "; goto errout; } } entry = EXT4_XATTR_NEXT(entry); } if (bh) set_buffer_verified(bh); return 0; errout: if (bh) __ext4_error_inode(inode, function, line, 0, -err, "corrupted xattr block %llu: %s", (unsigned long long) bh->b_blocknr, err_str); else __ext4_error_inode(inode, function, line, 0, -err, "corrupted in-inode xattr: %s", err_str); return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), function, line); } static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (!cmp || (cmp < 0 && sorted)) 
break; } *pentry = entry; return cmp ? -ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { inode_set_atime(ea_inode, hash, 0); } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? 
blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. 
*/ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); /* All good? */ if (e_hash == entry->e_hash) return 0; /* * Not good. Maybe the entry hash was calculated * using the buggy signed char version? */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); /* Still no match - bad */ if (e_hash != entry->e_hash) return -EFSCORRUPTED; /* Let people know about old hash */ pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } /* * Read xattr value from the EA inode. 
*/ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; 
size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
*/ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const char *prefix; prefix = ext4_xattr_prefix(entry->e_name_index, dentry); if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int 
ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. 
*/ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, 
round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. */ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_chunk_trans_extent(inode, 1) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. 
*/ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; s64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } 
static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; void 
*end; if (block_csum) end = (void *)bh->b_data + bh->b_size; else { ext4_get_inode_loc(parent, &iloc); end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. 
*/ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its 
journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. 
*/
/*
 * Allocates enough blocks in @ea_inode for @bufsize bytes, copies @buf into
 * them block by block (zero-filling the tail of the last block), then sets
 * i_size / i_disksize to the number of bytes written.
 *
 * Returns 0 on success or a negative error.  A failure to dirty one of the
 * data buffers is reported to the caller instead of being dropped.
 */
static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
				  const void *buf, int bufsize)
{
	struct buffer_head *bh = NULL;
	unsigned long block = 0;
	int blocksize = ea_inode->i_sb->s_blocksize;
	int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
	int csize, wsize = 0;
	int ret = 0, ret2 = 0;
	int retries = 0;

retry:
	/*
	 * Map blocks until the whole range is allocated.  ret holds the
	 * number of blocks mapped by the previous call, so both the start
	 * block and the remaining length are advanced by it.
	 */
	while (ret >= 0 && ret < max_blocks) {
		struct ext4_map_blocks map;
		map.m_lblk = block += ret;
		map.m_len = max_blocks -= ret;

		ret = ext4_map_blocks(handle, ea_inode, &map,
				      EXT4_GET_BLOCKS_CREATE);
		if (ret <= 0) {
			ext4_mark_inode_dirty(handle, ea_inode);
			if (ret == -ENOSPC &&
			    ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
				ret = 0;
				goto retry;
			}
			break;
		}
	}

	if (ret < 0)
		return ret;

	block = 0;
	while (wsize < bufsize) {
		/* Drop the buffer of the previous iteration (NULL at first). */
		brelse(bh);
		csize = (bufsize - wsize) > blocksize ? blocksize :
								bufsize - wsize;
		bh = ext4_getblk(handle, ea_inode, block, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		if (!bh) {
			WARN_ON_ONCE(1);
			EXT4_ERROR_INODE(ea_inode,
					 "ext4_getblk() return bh = NULL");
			return -EFSCORRUPTED;
		}
		ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh,
						   EXT4_JTR_NONE);
		if (ret)
			goto out;

		memcpy(bh->b_data, buf, csize);
		/*
		 * Zero out block tail to avoid writing uninitialized memory
		 * to disk.
		 */
		if (csize < blocksize)
			memset(bh->b_data + csize, 0, blocksize - csize);
		set_buffer_uptodate(bh);
		/* Do not report success if the data could not be dirtied. */
		ret = ext4_handle_dirty_metadata(handle, ea_inode, bh);
		if (ret)
			goto out;

		buf += csize;
		wsize += csize;
		block += 1;
	}

	inode_lock(ea_inode);
	i_size_write(ea_inode, wsize);
	ext4_update_i_disksize(ea_inode, wsize);
	inode_unlock(ea_inode);

	ret2 = ext4_mark_inode_dirty(handle, ea_inode);
	if (unlikely(ret2 && !ret))
		ret = ret2;

out:
	brelse(bh);

	return ret;
}

/*
 * Create an inode to store the value of a large EA.
*/ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. 
*/ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; /* Account inode & space to quota even if sharing... 
*/ err = ext4_xattr_inode_alloc_quota(inode, value_len); if (err) return ERR_PTR(err); hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) goto out_err; return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) { ext4_xattr_inode_free_quota(inode, NULL, value_len); return ea_inode; } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); return ea_inode; out_err: iput(ea_inode); ext4_xattr_inode_free_quota(inode, NULL, value_len); return ERR_PTR(err); } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. 
*/ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } /* We are ready to release ref count on the old_ea_inode. 
*/ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) goto out; ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. */ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. */ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. */ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. 
*/ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. 
*/ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) { error = PTR_ERR(ea_inode); ea_inode = NULL; goto cleanup; } } if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. 
*/ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. 
*/ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (IS_ERR(new_bh)) { error = PTR_ERR(new_bh); new_bh = NULL; goto cleanup; } if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. 
*/ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. 
*/ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { if (error) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); } iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = 
IFIRST(header); is->s.here = is->s.first; is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, false /* is_block */); if (error) { if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } return error; } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } iput(ea_inode); return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. 
*/ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. 
*/ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? 
Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. 
*/ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. 
*/
/*
 * @entry:            first entry of the list whose value offsets are adjusted
 * @value_offs_shift: signed delta added to every in-body value offset; must
 *                    not be positive (enforced with BUG_ON below)
 * @to, @from, @n:    destination, source and byte count handed to memmove()
 *
 * Entries whose value lives in an EA inode (e_value_inum != 0) or whose
 * value is empty (e_value_size == 0) keep their e_value_offs untouched.
 */
static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
				     int value_offs_shift, void *to,
				     void *from, size_t n)
{
	struct ext4_xattr_entry *last = entry;
	int new_offs;

	/* We always shift xattr headers further thus offsets get lower */
	BUG_ON(value_offs_shift > 0);

	/* Adjust the value offsets of the entries */
	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
		if (!last->e_value_inum && last->e_value_size) {
			new_offs = le16_to_cpu(last->e_value_offs) +
							value_offs_shift;
			last->e_value_offs = cpu_to_le16(new_offs);
		}
	}
	/* Shift the entries by n bytes */
	memmove(to, from, n);
}

/*
 * Move xattr pointed to by 'entry' from inode into external xattr block
 *
 * The entry is first set in the block (ext4_xattr_block_set) and only then
 * removed from the inode body (ext4_xattr_ibody_set with a NULL value).
 *
 * Returns 0 on success, -ENOMEM if a temporary allocation fails, or the
 * error returned by whichever helper failed.
 */
static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
				    struct ext4_inode *raw_inode,
				    struct ext4_xattr_entry *entry)
{
	struct ext4_xattr_ibody_find *is = NULL;
	struct ext4_xattr_block_find *bs = NULL;
	char *buffer = NULL, *b_entry_name = NULL;
	size_t value_size = le32_to_cpu(entry->e_value_size);
	struct ext4_xattr_info i = {
		.value = NULL,
		.value_len = 0,
		.name_index = entry->e_name_index,
		.in_inode = !!entry->e_value_inum,
	};
	struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
	int needs_kvfree = 0;
	int error;

	/* All three allocations are attempted, then checked together. */
	is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
	bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
	b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
	if (!is || !bs || !b_entry_name) {
		error = -ENOMEM;
		goto out;
	}

	is->s.not_found = -ENODATA;
	bs->s.not_found = -ENODATA;
	is->iloc.bh = NULL;
	bs->bh = NULL;

	/* Save the entry name and the entry value */
	if (entry->e_value_inum) {
		/* Value lives in an EA inode: read it into a private buffer. */
		buffer = kvmalloc(value_size, GFP_NOFS);
		if (!buffer) {
			error = -ENOMEM;
			goto out;
		}
		needs_kvfree = 1;
		error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
		if (error)
			goto out;
	} else {
		/*
		 * Value lives in the inode body: 'buffer' points straight at
		 * it (no copy), so it must not be freed at 'out'.
		 */
		size_t value_offs = le16_to_cpu(entry->e_value_offs);
		buffer = (void *)IFIRST(header) + value_offs;
	}

	/* e_name is copied with its stored length and NUL-terminated here. */
	memcpy(b_entry_name, entry->e_name, entry->e_name_len);
	b_entry_name[entry->e_name_len] = '\0';
	i.name = b_entry_name;

	error = ext4_get_inode_loc(inode, &is->iloc);
	if (error)
		goto out;

	/* Looked up while i.value is still NULL / i.value_len is still 0. */
	error = ext4_xattr_ibody_find(inode, &i, is);
	if (error)
		goto out;

	i.value = buffer;
	i.value_len = value_size;
	error = ext4_xattr_block_find(inode, &i, bs);
	if (error)
		goto out;

	/* Move ea entry from the inode into the block */
	error = ext4_xattr_block_set(handle, inode, &i, bs);
	if (error)
		goto out;

	/* Remove the chosen entry from the inode */
	i.value = NULL;
	i.value_len = 0;
	error = ext4_xattr_ibody_set(handle, inode, &i, is);

out:
	kfree(b_entry_name);
	/* Only the kvmalloc()ed copy is freed, never the in-body pointer. */
	if (needs_kvfree && buffer)
		kvfree(buffer);
	if (is)
		brelse(is->iloc.bh);
	if (bs)
		brelse(bs->bh);
	kfree(is);
	kfree(bs);

	return error;
}

/*
 * Move entries out of the inode body, one per loop iteration, until @ifree
 * is at least @isize_diff.
 *
 * @isize_diff: number of bytes that must become free in the inode body
 * @ifree:      bytes currently free in the inode body (local running total)
 * @bfree:      bytes currently free in the xattr block (local running total)
 * @total_ino:  in/out; reduced by the entry (name) size of every moved entry
 *
 * Returns 0 on success, -ENOSPC when no movable entry fits into @bfree, or
 * the error returned by ext4_xattr_move_to_block().
 */
static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
				       struct ext4_inode *raw_inode,
				       int isize_diff, size_t ifree,
				       size_t bfree, int *total_ino)
{
	struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
	struct ext4_xattr_entry *small_entry;
	struct ext4_xattr_entry *entry;
	struct ext4_xattr_entry *last;
	unsigned int entry_size;	/* EA entry size */
	unsigned int total_size;	/* EA entry size + value size */
	unsigned int min_total_size;
	int error;

	while (isize_diff > ifree) {
		entry = NULL;
		small_entry = NULL;
		min_total_size = ~0U;
		last = IFIRST(header);
		/* Find the entry best suited to be pushed into EA block */
		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
			/* never move system.data out of the inode */
			if ((last->e_name_len == 4) &&
			    (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) &&
			    !memcmp(last->e_name, "data", 4))
				continue;
			/* Values stored in an EA inode take no body space. */
			total_size = EXT4_XATTR_LEN(last->e_name_len);
			if (!last->e_value_inum)
				total_size += EXT4_XATTR_SIZE(
					       le32_to_cpu(last->e_value_size));
			/*
			 * 'entry' ends up as the smallest candidate that frees
			 * enough space on its own; 'small_entry' remembers a
			 * candidate that fits the block but is too small to
			 * satisfy the request alone.
			 */
			if (total_size <= bfree &&
			    total_size < min_total_size) {
				if (total_size + ifree < isize_diff) {
					small_entry = last;
				} else {
					entry = last;
					min_total_size = total_size;
				}
			}
		}

		if (entry == NULL) {
			if (small_entry == NULL)
				return -ENOSPC;
			/* Fall back to a small entry and loop again. */
			entry = small_entry;
		}

		/* Sizes are captured before the entry is moved. */
		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
		total_size = entry_size;
		if (!entry->e_value_inum)
			total_size += EXT4_XATTR_SIZE(
					      le32_to_cpu(entry->e_value_size));
		error = ext4_xattr_move_to_block(handle, inode, raw_inode,
						 entry);
		if (error)
			return error;

		*total_ino -= entry_size;
		ifree += total_size;
		bfree -= total_size;
	}

	return 0;
}

/*
 * Expand an inode by new_extra_isize bytes when EAs are present.
 * Returns 0 on success or negative error number on failure.
 *
 * If the request cannot be satisfied and the superblock's
 * s_min_extra_isize is non-zero, the whole procedure is retried exactly
 * once with new_extra_isize replaced by s_min_extra_isize.
 */
int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
			       struct ext4_inode *raw_inode, handle_t *handle)
{
	struct ext4_xattr_ibody_header *header;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	/*
	 * Function-static: remembers the s_mnt_count value at which the
	 * warning in 'cleanup' was last printed.
	 */
	static unsigned int mnt_count;
	size_t min_offs;
	size_t ifree, bfree;
	int total_ino;
	void *base, *end;
	int error = 0, tried_min_extra_isize = 0;
	int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize);
	int isize_diff;	/* How much do we need to grow i_extra_isize */

retry:
	isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
	/* Nothing to do when the inode is already at least that large. */
	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
		return 0;

	header = IHDR(inode, raw_inode);

	/*
	 * Check if enough free space is available in the inode to shift the
	 * entries ahead by new_extra_isize.
	 */

	base = IFIRST(header);
	end = ITAIL(inode, raw_inode);
	min_offs = end - base;
	total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);

	ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
	if (ifree >= isize_diff)
		goto shift;

	/*
	 * Enough free space isn't available in the inode, check if
	 * EA block can hold new_extra_isize bytes.
	 */
	if (EXT4_I(inode)->i_file_acl) {
		struct buffer_head *bh;

		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
		if (IS_ERR(bh)) {
			error = PTR_ERR(bh);
			goto cleanup;
		}
		error = ext4_xattr_check_block(inode, bh);
		if (error) {
			brelse(bh);
			goto cleanup;
		}
		base = BHDR(bh);
		end = bh->b_data + bh->b_size;
		min_offs = end - base;
		bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base,
					      NULL);
		/* The block is only inspected here; drop the reference. */
		brelse(bh);
		if (bfree + ifree < isize_diff) {
			if (!tried_min_extra_isize && s_min_extra_isize) {
				tried_min_extra_isize++;
				new_extra_isize = s_min_extra_isize;
				goto retry;
			}
			error = -ENOSPC;
			goto cleanup;
		}
	} else {
		/* No xattr block yet: treat a whole block as available. */
		bfree = inode->i_sb->s_blocksize;
	}

	error = ext4_xattr_make_inode_space(handle, inode, raw_inode,
					    isize_diff, ifree, bfree,
					    &total_ino);
	if (error) {
		if (error == -ENOSPC && !tried_min_extra_isize &&
		    s_min_extra_isize) {
			tried_min_extra_isize++;
			new_extra_isize = s_min_extra_isize;
			goto retry;
		}
		goto cleanup;
	}
shift:
	/* Adjust the offsets and shift the remaining entries ahead */
	ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize
			- new_extra_isize, (void *)raw_inode +
			EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
			(void *)header, total_ino);
	EXT4_I(inode)->i_extra_isize = new_extra_isize;

	if (ext4_has_inline_data(inode))
		error = ext4_find_inline_data_nolock(inode);

cleanup:
	/* Warn only when s_mnt_count differs from the last warned value. */
	if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) {
		ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.",
			     inode->i_ino);
		mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count);
	}
	return error;
}

#define EIA_INCR 16 /* must be 2^n */
#define EIA_MASK (EIA_INCR - 1)

/* Add the large xattr @inode into @ea_inode_array for deferred iput().
 * If @ea_inode_array is new or full it will be grown and the old
 * contents copied over.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails; on failure
 * *@ea_inode_array is left as it was.
 */
static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
			struct inode *inode)
{
	if (*ea_inode_array == NULL) {
		/*
		 * Start with 15 inodes, so it fits into a power-of-two size.
		 */
		(*ea_inode_array) = kmalloc(
			struct_size(*ea_inode_array, inodes, EIA_MASK),
			GFP_NOFS);
		if (*ea_inode_array == NULL)
			return -ENOMEM;
		(*ea_inode_array)->count = 0;
	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
		/* expand the array once all 15 + n * 16 slots are full */
		struct ext4_xattr_inode_array *new_array = NULL;

		new_array = kmalloc(
			struct_size(*ea_inode_array, inodes,
				    (*ea_inode_array)->count + EIA_INCR),
			GFP_NOFS);
		if (new_array == NULL)
			return -ENOMEM;
		/* Copies the header and the 'count' slots filled so far. */
		memcpy(new_array, *ea_inode_array,
		       struct_size(*ea_inode_array, inodes,
				   (*ea_inode_array)->count));
		kfree(*ea_inode_array);
		*ea_inode_array = new_array;
	}
	/* Append @inode in the next free slot. */
	(*ea_inode_array)->count++;
	(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
	return 0;
}

/*
 * ext4_xattr_delete_inode()
 *
 * Free extended attribute resources associated with this inode. Traverse
 * all entries and decrement reference on any xattr inodes associated with this
 * inode. This is called immediately before an inode is freed. We have exclusive
 * access to the inode. If an orphan inode is deleted it will also release its
 * references on xattr block and xattr inodes.
*/
/*
 * @handle:         journal handle the work is charged to
 * @inode:          inode whose xattr resources are released
 * @ea_inode_array: passed through to ext4_xattr_inode_dec_ref_all() and
 *                  ext4_xattr_release_block()
 * @extra_credits:  passed to ext4_journal_ensure_credits() and to the two
 *                  helpers above
 *
 * Returns 0 on success or a negative error number.
 */
int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
			    struct ext4_xattr_inode_array **ea_inode_array,
			    int extra_credits)
{
	struct buffer_head *bh = NULL;
	struct ext4_xattr_ibody_header *header;
	struct ext4_iloc iloc = { .bh = NULL };
	struct ext4_xattr_entry *entry;
	struct inode *ea_inode;
	int error;

	error = ext4_journal_ensure_credits(handle, extra_credits,
			ext4_free_metadata_revoke_credits(inode->i_sb, 1));
	if (error < 0) {
		EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
		goto cleanup;
	}

	/* In-body xattrs: only walked when the ea_inode feature is enabled. */
	if (ext4_has_feature_ea_inode(inode->i_sb) &&
	    ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {

		error = ext4_get_inode_loc(inode, &iloc);
		if (error) {
			EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
			goto cleanup;
		}

		error = ext4_journal_get_write_access(handle, inode->i_sb,
						iloc.bh, EXT4_JTR_NONE);
		if (error) {
			EXT4_ERROR_INODE(inode, "write access (error %d)",
					 error);
			goto cleanup;
		}

		header = IHDR(inode, ext4_raw_inode(&iloc));
		/* Skip the walk unless the in-body header carries the magic. */
		if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
						     IFIRST(header),
						     false /* block_csum */,
						     ea_inode_array,
						     extra_credits,
						     false /* skip_quota */);
	}

	/* External xattr block, if the inode has one. */
	if (EXT4_I(inode)->i_file_acl) {
		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
		if (IS_ERR(bh)) {
			error = PTR_ERR(bh);
			if (error == -EIO) {
				EXT4_ERROR_INODE_ERR(inode, EIO,
						     "block %llu read error",
						     EXT4_I(inode)->i_file_acl);
			}
			/* Reset so the brelse() at 'cleanup' sees NULL. */
			bh = NULL;
			goto cleanup;
		}
		error = ext4_xattr_check_block(inode, bh);
		if (error)
			goto cleanup;

		if (ext4_has_feature_ea_inode(inode->i_sb)) {
			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
			     entry = EXT4_XATTR_NEXT(entry)) {
				if (!entry->e_value_inum)
					continue;
				error = ext4_xattr_inode_iget(inode,
					      le32_to_cpu(entry->e_value_inum),
					      le32_to_cpu(entry->e_hash),
					      &ea_inode);
				/*
				 * A failed lookup is skipped, not propagated:
				 * 'error' is overwritten further down.
				 */
				if (error)
					continue;
				ext4_xattr_inode_free_quota(inode, ea_inode,
					      le32_to_cpu(entry->e_value_size));
				iput(ea_inode);
			}

		}

		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
					 extra_credits);
		/*
		 * Update i_file_acl value in the same transaction that releases
		 * block.
		 */
		EXT4_I(inode)->i_file_acl = 0;
		error = ext4_mark_inode_dirty(handle, inode);
		if (error) {
			EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
					 error);
			goto cleanup;
		}
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
	}
	error = 0;
cleanup:
	/* Both pointers are NULL when the matching section was not entered. */
	brelse(iloc.bh);
	brelse(bh);
	return error;
}

/*
 * Drop the reference held on every inode stored in @ea_inode_array (via
 * iput()) and free the array itself. A NULL array is accepted and ignored.
 */
void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
{
	int idx;

	if (ea_inode_array == NULL)
		return;

	for (idx = 0; idx < ea_inode_array->count; ++idx)
		iput(ea_inode_array->inodes[idx]);
	kfree(ea_inode_array);
}

/*
 * ext4_xattr_block_cache_insert()
 *
 * Create a new entry in the extended attribute block cache, and insert
 * it unless such an entry is already in the cache.
 *
 * Does nothing when @ea_block_cache is NULL. The entry is keyed by the
 * block's h_hash and flagged reusable while h_refcount is below
 * EXT4_XATTR_REFCOUNT_MAX. Failures are not reported to the caller.
 */
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
			      struct buffer_head *bh)
{
	struct ext4_xattr_header *header = BHDR(bh);
	__u32 hash = le32_to_cpu(header->h_hash);
	int reusable = le32_to_cpu(header->h_refcount) <
		       EXT4_XATTR_REFCOUNT_MAX;
	int error;

	if (!ea_block_cache)
		return;
	error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash,
				      bh->b_blocknr, reusable);
	if (error) {
		/* Only -EBUSY gets a debug message; other errors are silent. */
		if (error == -EBUSY)
			ea_bdebug(bh, "already in cache");
	} else
		ea_bdebug(bh, "inserting [%x]", (int)hash);
}

/*
 * ext4_xattr_cmp()
 *
 * Compare two extended attribute blocks for equality.
 *
 * Returns 0 if the blocks are equal, 1 if they differ.
*/
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
	       struct ext4_xattr_header *header2)
{
	struct ext4_xattr_entry *lhs = ENTRY(header1 + 1);
	struct ext4_xattr_entry *rhs = ENTRY(header2 + 1);

	/* Walk both entry lists in lock-step, driven by the first one. */
	for (; !IS_LAST_ENTRY(lhs);
	     lhs = EXT4_XATTR_NEXT(lhs), rhs = EXT4_XATTR_NEXT(rhs)) {
		const char *lhs_val, *rhs_val;

		/* The second list is shorter than the first. */
		if (IS_LAST_ENTRY(rhs))
			return 1;

		/* Fixed-size fields are compared in their stored form. */
		if (lhs->e_hash != rhs->e_hash)
			return 1;
		if (lhs->e_name_index != rhs->e_name_index)
			return 1;
		if (lhs->e_name_len != rhs->e_name_len)
			return 1;
		if (lhs->e_value_size != rhs->e_value_size)
			return 1;
		if (lhs->e_value_inum != rhs->e_value_inum)
			return 1;

		/* Name lengths are known to match at this point. */
		if (memcmp(lhs->e_name, rhs->e_name, lhs->e_name_len) != 0)
			return 1;

		/* Values kept in an EA inode are not compared byte-wise. */
		if (lhs->e_value_inum)
			continue;

		lhs_val = (char *)header1 + le16_to_cpu(lhs->e_value_offs);
		rhs_val = (char *)header2 + le16_to_cpu(rhs->e_value_offs);
		if (memcmp(lhs_val, rhs_val,
			   le32_to_cpu(lhs->e_value_size)) != 0)
			return 1;
	}

	/* Equal only if the second list ends here as well. */
	return IS_LAST_ENTRY(rhs) ? 0 : 1;
}

/*
 * ext4_xattr_block_cache_find()
 *
 * Find an identical extended attribute block.
 *
 * Returns a pointer to the block found, or NULL if such a block was not
 * found, or an error pointer if an error occurred while reading ea block.
*/ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, struct ext4_xattr_header *header, struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!ea_block_cache) return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { if (PTR_ERR(bh) != -ENOMEM) EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); mb_cache_entry_put(ea_block_cache, ce); return bh; } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } #define NAME_HASH_SHIFT 5 #define VALUE_HASH_SHIFT 16 /* * ext4_xattr_hash_entry() * * Compute the hash of an extended attribute. */ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (unsigned char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } /* * ext4_xattr_hash_entry_signed() * * Compute the hash of an extended attribute incorrectly. 
*/ static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (signed char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT #undef VALUE_HASH_SHIFT #define BLOCK_HASH_SHIFT 16 /* * ext4_xattr_rehash() * * Re-compute the extended attribute hash value after an entry has changed. */ static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { /* Block is not shared if an entry's hash value == 0 */ hash = 0; break; } hash = (hash << BLOCK_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ le32_to_cpu(here->e_hash); here = EXT4_XATTR_NEXT(here); } header->h_hash = cpu_to_le32(hash); } #undef BLOCK_HASH_SHIFT #define HASH_BUCKET_BITS 10 struct mb_cache * ext4_xattr_create_cache(void) { return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) mb_cache_destroy(cache); }
28 39 14 13 14 2121 7 2110 36 36 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/proc_ns.h> #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc init_binfmt_misc = { .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), .enabled = true, .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), }; EXPORT_SYMBOL_GPL(init_binfmt_misc); #endif /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? 
*/
struct user_namespace init_user_ns = {
	/*
	 * The uid, gid and projid maps each hold a single extent that maps
	 * ids starting at 0 onto themselves (first == lower_first == 0) for
	 * 4294967295 ids.
	 */
	.uid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	.gid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	.projid_map = {
		{
			.extent[0] = {
				.first = 0,
				.lower_first = 0,
				.count = 4294967295U,
			},
			.nr_extents = 1,
		},
	},
	/* Initial reference count of 3; see the comment above the struct. */
	.ns.count = REFCOUNT_INIT(3),
	.owner = GLOBAL_ROOT_UID,
	.group = GLOBAL_ROOT_GID,
	.ns.inum = PROC_USER_INIT_INO,
#ifdef CONFIG_USER_NS
	.ns.ops = &userns_operations,
#endif
	.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_KEYS
	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
#if IS_ENABLED(CONFIG_BINFMT_MISC)
	.binfmt_misc = &init_binfmt_misc,
#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);

/*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 * when changing user ID's (ie setuid() and friends).
 */

/* 3 hash bits (8 buckets) with CONFIG_BASE_SMALL, otherwise 7 (128 buckets) */
#define UIDHASH_BITS	(IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7)
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK		(UIDHASH_SZ - 1)
/* Fold the bits above UIDHASH_BITS into the low bits, then mask to a bucket */
#define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
/* Bucket head in uidhash_table for a kuid_t */
#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))

/* Slab cache for struct user_struct; created in uid_cache_init() */
static struct kmem_cache *uid_cachep;
/* Hash table of user_structs, indexed through uidhashentry() */
static struct hlist_head uidhash_table[UIDHASH_SZ];

/*
 * The uidhash_lock is mostly taken from process context, but it is
 * occasionally also taken from softirq/tasklet context, when
 * task-structs get RCU-freed. Hence all locking must be softirq-safe.
 * But free_uid() is also called with local interrupts disabled, and running
 * local_bh_enable() with local interrupts disabled is an error - we'll run
 * softirq callbacks, and they can unconditionally enable interrupts, and
 * the caller of free_uid() didn't expect that..
*/ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. 
*/ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 // SPDX-License-Identifier: GPL-2.0+ /* * Driver for Lexar "Jumpshot" Compact Flash reader * * jumpshot driver v0.1: * * First release * * Current development and maintenance by: * (c) 2000 Jimmie Mayfield (mayfield+usb@sackheads.org) * * Many thanks to Robert Baruch for the SanDisk SmartMedia reader driver * which I used as a template for this driver. * * Some bugfixes and scatter-gather code by Gregory P. Smith * (greg-usb@electricrain.com) * * Fix for media change by Joerg Schneider (js@joergschneider.com) * * Developed with the assistance of: * * (C) 2002 Alan Stern <stern@rowland.org> */ /* * This driver attempts to support the Lexar Jumpshot USB CompactFlash * reader. Like many other USB CompactFlash readers, the Jumpshot contains * a USB-to-ATA chip. * * This driver supports reading and writing. If you're truly paranoid, * however, you can force the driver into a write-protected state by setting * the WP enable bits in jumpshot_handle_mode_sense. See the comments * in that routine. 
*/ #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "scsiglue.h" #define DRV_NAME "ums-jumpshot" MODULE_DESCRIPTION("Driver for Lexar \"Jumpshot\" Compact Flash reader"); MODULE_AUTHOR("Jimmie Mayfield <mayfield+usb@sackheads.org>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("USB_STORAGE"); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static const struct usb_device_id jumpshot_usb_ids[] = { # include "unusual_jumpshot.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, jumpshot_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static const struct us_unusual_dev jumpshot_unusual_dev_list[] = { # include "unusual_jumpshot.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV struct jumpshot_info { unsigned long sectors; /* total sector count */ unsigned long ssize; /* sector size in bytes */ /* the following aren't used yet */ unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ }; static inline int jumpshot_bulk_read(struct us_data *us, unsigned char *data, unsigned int len) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, len, NULL); } static inline int 
jumpshot_bulk_write(struct us_data *us, unsigned char *data, unsigned int len) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, data, len, NULL); } static int jumpshot_get_status(struct us_data *us) { int rc; if (!us) return USB_STOR_TRANSPORT_ERROR; // send the setup rc = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, 0, 0xA0, 0, 7, us->iobuf, 1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (us->iobuf[0] != 0x50) { usb_stor_dbg(us, "0x%2x\n", us->iobuf[0]); return USB_STOR_TRANSPORT_ERROR; } return USB_STOR_TRANSPORT_GOOD; } static int jumpshot_read_data(struct us_data *us, struct jumpshot_info *info, u32 sector, u32 sectors) { unsigned char *command = us->iobuf; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. I don't know if Jumpshot // supports beyond 24-bit addressing. It's kind of hard to test // since it requires > 8GB CF card. if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; // Since we don't read more than 64 KB at a time, we have to create // a bounce buffer and move the data a piece at a time between the // bounce buffer and the actual transfer buffer. 
alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { // loop, never allocate or transfer more than 64k at once // (min(128k, 255*info->ssize) is the real limit) len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; command[0] = 0; command[1] = thistime; command[2] = sector & 0xFF; command[3] = (sector >> 8) & 0xFF; command[4] = (sector >> 16) & 0xFF; command[5] = 0xE0 | ((sector >> 24) & 0x0F); command[6] = 0x20; // send the setup + command result = usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, 0, 0x20, 0, 1, command, 7); if (result != USB_STOR_XFER_GOOD) goto leave; // read the result result = jumpshot_bulk_read(us, buffer, len); if (result != USB_STOR_XFER_GOOD) goto leave; usb_stor_dbg(us, "%d bytes\n", len); // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return USB_STOR_TRANSPORT_GOOD; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } static int jumpshot_write_data(struct us_data *us, struct jumpshot_info *info, u32 sector, u32 sectors) { unsigned char *command = us->iobuf; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result, waitcount; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; // we're working in LBA mode. according to the ATA spec, // we can support up to 28-bit addressing. I don't know if Jumpshot // supports beyond 24-bit addressing. It's kind of hard to test // since it requires > 8GB CF card. // if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; // Since we don't write more than 64 KB at a time, we have to create // a bounce buffer and move the data a piece at a time between the // bounce buffer and the actual transfer buffer. 
alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { // loop, never allocate or transfer more than 64k at once // (min(128k, 255*info->ssize) is the real limit) len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; // Get the data from the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, FROM_XFER_BUF); command[0] = 0; command[1] = thistime; command[2] = sector & 0xFF; command[3] = (sector >> 8) & 0xFF; command[4] = (sector >> 16) & 0xFF; command[5] = 0xE0 | ((sector >> 24) & 0x0F); command[6] = 0x30; // send the setup + command result = usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, 0, 0x20, 0, 1, command, 7); if (result != USB_STOR_XFER_GOOD) goto leave; // send the data result = jumpshot_bulk_write(us, buffer, len); if (result != USB_STOR_XFER_GOOD) goto leave; // read the result. apparently the bulk write can complete // before the jumpshot drive is finished writing. so we loop // here until we get a good return code waitcount = 0; do { result = jumpshot_get_status(us); if (result != USB_STOR_TRANSPORT_GOOD) { // I have not experimented to find the smallest value. // msleep(50); } } while ((result != USB_STOR_TRANSPORT_GOOD) && (waitcount < 10)); if (result != USB_STOR_TRANSPORT_GOOD) usb_stor_dbg(us, "Gah! Waitcount = 10. Bad write!?\n"); sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return result; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } static int jumpshot_id_device(struct us_data *us, struct jumpshot_info *info) { unsigned char *command = us->iobuf; unsigned char *reply; int rc; if (!info) return USB_STOR_TRANSPORT_ERROR; command[0] = 0xE0; command[1] = 0xEC; reply = kmalloc(512, GFP_NOIO); if (!reply) return USB_STOR_TRANSPORT_ERROR; // send the setup rc = usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, 0, 0x20, 0, 6, command, 2); if (rc != USB_STOR_XFER_GOOD) { usb_stor_dbg(us, "Gah! 
send_control for read_capacity failed\n"); rc = USB_STOR_TRANSPORT_ERROR; goto leave; } // read the reply rc = jumpshot_bulk_read(us, reply, 512); if (rc != USB_STOR_XFER_GOOD) { rc = USB_STOR_TRANSPORT_ERROR; goto leave; } info->sectors = ((u32)(reply[117]) << 24) | ((u32)(reply[116]) << 16) | ((u32)(reply[115]) << 8) | ((u32)(reply[114]) ); rc = USB_STOR_TRANSPORT_GOOD; leave: kfree(reply); return rc; } static int jumpshot_handle_mode_sense(struct us_data *us, struct scsi_cmnd * srb, int sense_6) { static const unsigned char rw_err_page[12] = { 0x1, 0xA, 0x21, 1, 0, 0, 0, 0, 1, 0, 0, 0 }; static const unsigned char cache_page[12] = { 0x8, 0xA, 0x1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static const unsigned char rbac_page[12] = { 0x1B, 0xA, 0, 0x81, 0, 0, 0, 0, 0, 0, 0, 0 }; static const unsigned char timer_page[8] = { 0x1C, 0x6, 0, 0, 0, 0 }; unsigned char pc, page_code; unsigned int i = 0; struct jumpshot_info *info = (struct jumpshot_info *) (us->extra); unsigned char *ptr = us->iobuf; pc = srb->cmnd[2] >> 6; page_code = srb->cmnd[2] & 0x3F; switch (pc) { case 0x0: usb_stor_dbg(us, "Current values\n"); break; case 0x1: usb_stor_dbg(us, "Changeable values\n"); break; case 0x2: usb_stor_dbg(us, "Default values\n"); break; case 0x3: usb_stor_dbg(us, "Saves values\n"); break; } memset(ptr, 0, 8); if (sense_6) { ptr[2] = 0x00; // WP enable: 0x80 i = 4; } else { ptr[3] = 0x00; // WP enable: 0x80 i = 8; } switch (page_code) { case 0x0: // vendor-specific mode info->sense_key = 0x05; info->sense_asc = 0x24; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; case 0x1: memcpy(ptr + i, rw_err_page, sizeof(rw_err_page)); i += sizeof(rw_err_page); break; case 0x8: memcpy(ptr + i, cache_page, sizeof(cache_page)); i += sizeof(cache_page); break; case 0x1B: memcpy(ptr + i, rbac_page, sizeof(rbac_page)); i += sizeof(rbac_page); break; case 0x1C: memcpy(ptr + i, timer_page, sizeof(timer_page)); i += sizeof(timer_page); break; case 0x3F: memcpy(ptr + i, timer_page, 
sizeof(timer_page)); i += sizeof(timer_page); memcpy(ptr + i, rbac_page, sizeof(rbac_page)); i += sizeof(rbac_page); memcpy(ptr + i, cache_page, sizeof(cache_page)); i += sizeof(cache_page); memcpy(ptr + i, rw_err_page, sizeof(rw_err_page)); i += sizeof(rw_err_page); break; } if (sense_6) ptr[0] = i - 1; else ((__be16 *) ptr)[0] = cpu_to_be16(i - 2); usb_stor_set_xfer_buf(ptr, i, srb); return USB_STOR_TRANSPORT_GOOD; } static void jumpshot_info_destructor(void *extra) { // this routine is a placeholder... // currently, we don't allocate any extra blocks so we're okay } // Transport for the Lexar 'Jumpshot' // static int jumpshot_transport(struct scsi_cmnd *srb, struct us_data *us) { struct jumpshot_info *info; int rc; unsigned long block, blocks; unsigned char *ptr = us->iobuf; static const unsigned char inquiry_response[8] = { 0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00 }; if (!us->extra) { us->extra = kzalloc(sizeof(struct jumpshot_info), GFP_NOIO); if (!us->extra) return USB_STOR_TRANSPORT_ERROR; us->extra_destructor = jumpshot_info_destructor; } info = (struct jumpshot_info *) (us->extra); if (srb->cmnd[0] == INQUIRY) { usb_stor_dbg(us, "INQUIRY - Returning bogus response\n"); memcpy(ptr, inquiry_response, sizeof(inquiry_response)); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_CAPACITY) { info->ssize = 0x200; // hard coded 512 byte sectors as per ATA spec rc = jumpshot_get_status(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; rc = jumpshot_id_device(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; usb_stor_dbg(us, "READ_CAPACITY: %ld sectors, %ld bytes per sector\n", info->sectors, info->ssize); // build the reply // ((__be32 *) ptr)[0] = cpu_to_be32(info->sectors - 1); ((__be32 *) ptr)[1] = cpu_to_be32(info->ssize); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == MODE_SELECT_10) { usb_stor_dbg(us, "Gah! 
MODE_SELECT_10\n"); return USB_STOR_TRANSPORT_ERROR; } if (srb->cmnd[0] == READ_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "READ_10: read block 0x%04lx count %ld\n", block, blocks); return jumpshot_read_data(us, info, block, blocks); } if (srb->cmnd[0] == READ_12) { // I don't think we'll ever see a READ_12 but support it anyway... // block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "READ_12: read block 0x%04lx count %ld\n", block, blocks); return jumpshot_read_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "WRITE_10: write block 0x%04lx count %ld\n", block, blocks); return jumpshot_write_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_12) { // I don't think we'll ever see a WRITE_12 but support it anyway... 
// block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "WRITE_12: write block 0x%04lx count %ld\n", block, blocks); return jumpshot_write_data(us, info, block, blocks); } if (srb->cmnd[0] == TEST_UNIT_READY) { usb_stor_dbg(us, "TEST_UNIT_READY\n"); return jumpshot_get_status(us); } if (srb->cmnd[0] == REQUEST_SENSE) { usb_stor_dbg(us, "REQUEST_SENSE\n"); memset(ptr, 0, 18); ptr[0] = 0xF0; ptr[2] = info->sense_key; ptr[7] = 11; ptr[12] = info->sense_asc; ptr[13] = info->sense_ascq; usb_stor_set_xfer_buf(ptr, 18, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == MODE_SENSE) { usb_stor_dbg(us, "MODE_SENSE_6 detected\n"); return jumpshot_handle_mode_sense(us, srb, 1); } if (srb->cmnd[0] == MODE_SENSE_10) { usb_stor_dbg(us, "MODE_SENSE_10 detected\n"); return jumpshot_handle_mode_sense(us, srb, 0); } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) { /* * sure. whatever. not like we can stop the user from popping * the media out of the device (no locking doors, etc) */ return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == START_STOP) { /* * this is used by sd.c'check_scsidisk_media_change to detect * media change */ usb_stor_dbg(us, "START_STOP\n"); /* * the first jumpshot_id_device after a media change returns * an error (determined experimentally) */ rc = jumpshot_id_device(us, info); if (rc == USB_STOR_TRANSPORT_GOOD) { info->sense_key = NO_SENSE; srb->result = SUCCESS; } else { info->sense_key = UNIT_ATTENTION; srb->result = SAM_STAT_CHECK_CONDITION; } return rc; } usb_stor_dbg(us, "Gah! 
Unknown command: %d (0x%x)\n", srb->cmnd[0], srb->cmnd[0]); info->sense_key = 0x05; info->sense_asc = 0x20; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } static struct scsi_host_template jumpshot_host_template; static int jumpshot_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - jumpshot_usb_ids) + jumpshot_unusual_dev_list, &jumpshot_host_template); if (result) return result; us->transport_name = "Lexar Jumpshot Control/Bulk"; us->transport = jumpshot_transport; us->transport_reset = usb_stor_Bulk_reset; us->max_lun = 1; result = usb_stor_probe2(us); return result; } static struct usb_driver jumpshot_driver = { .name = DRV_NAME, .probe = jumpshot_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = jumpshot_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(jumpshot_driver, jumpshot_host_template, DRV_NAME);
9 9 9 4994 4984 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * associating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify non-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. 
*/ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. 
*/ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } static struct cftype ioprio_files[] = { { .name = "prio.class", .seq_show = ioprio_show_prio_policy, .write = ioprio_set_prio_policy, }, { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. 
*/ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit);
114 119 119 92 114 7 7 119 78 119 119 74 75 6 6 6 6 6 23 91 91 87 91 91 114 99 101 101 101 99 101 113 114 114 114 114 113 113 2 2 112 113 112 113 1 114 2 2 2 2 2 2 2 2 7 7 6 6 6 6 5 1 5 6 1 6 6 6 6 6 6 6 92 92 92 90 92 91 92 132 97 113 113 113 114 113 113 87 87 1 114 1 114 113 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 91 91 37 172 172 171 40 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 
940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 
1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" #include "errcode.h" #include "error.h" #include "journal.h" #include "trace.h" #include <linux/prefetch.h> #include <linux/sched/mm.h> #include <linux/swap.h> const char * const bch2_btree_node_flags[] = { "typebit", "typebit", "typebit", #define x(f) [BTREE_NODE_##f] = #f, BTREE_FLAGS() #undef x NULL }; void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned reserve = 16; if (!c->btree_roots_known[0].b) reserve += 8; for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) reserve += min_t(unsigned, 1, r->b->c.level) * 8; } c->btree_cache.nr_reserve = reserve; } static inline size_t btree_cache_can_free(struct btree_cache_list *list) { struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); size_t can_free = list->nr; if (!list->idx) can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); return can_free; } static void btree_node_to_freedlist(struct
btree_cache *bc, struct btree *b)
{
	/* The node must not be linked on any list when it gets here. */
	BUG_ON(!list_empty(&b->list));

	/*
	 * Nodes whose lock has a readers pointer set go on freed_pcpu,
	 * all others on freed_nonpcpu.
	 */
	if (b->c.lock.readers)
		list_add(&b->list, &bc->freed_pcpu);
	else
		list_add(&b->list, &bc->freed_nonpcpu);
}

/*
 * Put @b on the freeable list and bump nr_freeable. @b must be off every
 * list and must still have a data buffer. No locking is done here; see
 * bch2_btree_node_to_freelist() for the variant that takes bc->lock.
 */
static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(!b->data);

	bc->nr_freeable++;
	list_add(&b->list, &bc->freeable);
}

/*
 * Put @b on the freeable list under bc->lock, then drop the write and intent
 * locks on the node.
 */
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	__bch2_btree_node_to_freelist(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

/*
 * Free the data and aux_data buffers of @b and clear both pointers. The
 * struct btree itself is not freed. @b must be off every list and unhashed.
 */
void __btree_node_data_free(struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));

	/*
	 * This should really be done in slub/vmalloc, but we're using the
	 * kmalloc_large() path, so we're working around a slub bug by doing
	 * this here:
	 */
	if (b->data)
		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
	if (b->aux_data)
		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
	/* aux_data is released the same way btree_node_data_alloc() obtained it. */
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;
}

/*
 * Unlink @b from the list it is on (it must be on one), free its buffers,
 * decrement nr_freeable and park the struct on a freed list.
 */
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(list_empty(&b->list));
	list_del_init(&b->list);

	__btree_node_data_free(b);

	--bc->nr_freeable;
	btree_node_to_freedlist(bc, b);
}

/* rhashtable compare callback: 0 when the node's hash_val equals the key. */
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ?
0 : 1;
}

/* Hash table parameters: nodes are keyed by the u64 stored in btree.hash_val. */
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset		= offsetof(struct btree, hash),
	.key_offset		= offsetof(struct btree, hash_val),
	.key_len		= sizeof(u64),
	.obj_cmpfn		= bch2_btree_cache_cmp_fn,
	.automatic_shrinking	= true,
};

/*
 * Allocate the data and aux_data buffers for @b; both must be unset on
 * entry. Returns 0, or an ENOMEM_btree_node_mem_alloc error with both
 * pointers left NULL.
 */
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	/* Normalise the mmap() failure value so the check below covers both paths. */
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		/* Undo the first allocation so the node is left with no buffers. */
		kvfree(b->data);
		b->data = NULL;
		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
	}

	return 0;
}

/*
 * Allocate and minimally initialise a struct btree (no data buffers, lock
 * not initialised here). Returns NULL on allocation failure.
 */
static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);
	return b;
}

/*
 * Allocate a struct btree together with its buffers using GFP_KERNEL and
 * initialise its lock. Returns NULL if either allocation fails.
 */
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
	return b;
}

/*
 * True when @b's btree id is set in the pinned mask for its kind (index 0
 * for level 0, index 1 for any other level) and its position lies in
 * (pinned_nodes_start, pinned_nodes_end].
 */
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

/*
 * Mark @b pinned and move it from live[0] to live[1], unless it is a btree
 * root or already pinned.
 */
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	mutex_lock(&bc->lock);
	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
		set_btree_node_pinned(b);
list_move(&b->list, &bc->live[1].list);
		bc->live[0].nr--;
		bc->live[1].nr++;
	}
	mutex_unlock(&bc->lock);
}

/*
 * Clear both pinned masks and move every node on live[1] back to live[0],
 * clearing its pinned flag.
 */
void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *n;

	mutex_lock(&bc->lock);
	c->btree_cache.pinned_nodes_mask[0] = 0;
	c->btree_cache.pinned_nodes_mask[1] = 0;

	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
		clear_btree_node_pinned(b);
		list_move(&b->list, &bc->live[0].list);
		bc->live[0].nr++;
		bc->live[1].nr--;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

/*
 * Remove @b from the hash table and from its live list, updating the
 * per-btree and per-list counters. Caller must hold bc->lock.
 */
void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	lockdep_assert_held(&bc->lock);

	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];
	--bc->live[btree_node_pinned(b)].nr;
	list_del_init(&b->list);
}

/* Unhash @b and put it on the freeable list. */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	__bch2_btree_node_hash_remove(bc, b);
	__bch2_btree_node_to_freelist(bc, b);
}

/*
 * Hash @b by its key and append it to the live list selected by
 * __btree_node_pinned(). @b must be off every list and unhashed. Returns 0,
 * or the error from rhashtable_lookup_insert_fast(); in the error case
 * b->hash_val stays set and the caller has to reset it.
 */
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(!list_empty(&b->list));
	BUG_ON(b->hash_val);

	b->hash_val = btree_ptr_hash_val(&b->key);
	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
						bch_btree_cache_params);
	if (ret)
		return ret;

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_add_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;
	return 0;
}

/* Set @b's level and btree id, then hash it under bc->lock. */
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	int ret = __bch2_btree_node_hash_insert(bc, b);
	mutex_unlock(&bc->lock);

	return ret;
}

/*
 * Look up the node that @old points to and, if it is found, rehash it under
 * the key @new. Nothing is done when the lookup returns NULL or an error.
 */
void bch2_btree_node_update_key_early(struct btree_trans *trans,
				      enum btree_id btree, unsigned level,
				      struct bkey_s_c old, struct
bkey_i *new)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bkey_buf tmp;
	int ret;

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_reassemble(&tmp, c, old);

	b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
	if (!IS_ERR_OR_NULL(b)) {
		mutex_lock(&c->btree_cache.lock);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);

		mutex_unlock(&c->btree_cache.lock);
		six_unlock_read(&b->c.lock);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

/* Look up the cached node for key @k by its pointer hash; NULL if absent. */
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
				     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * Decide whether @b may be reclaimed. Caller must hold bc->lock.
 *
 * Returns 0 when nothing blocks reclaim. Otherwise the matching
 * bc->not_freed[] counter is bumped and an ENOMEM_btree_node_reclaim error is
 * returned. With @flush set, a dirty node is written out when @locked is
 * true, and in-flight I/O is either waited for (@locked false) or reported
 * as -EINTR (@locked true).
 */
static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
				       bool flush, bool locked)
{
	struct btree_cache *bc = &c->btree_cache;

	lockdep_assert_held(&bc->lock);

	if (btree_node_noevict(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}
	if (btree_node_write_blocked(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}
	if (btree_node_will_make_reachable(b)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	if (btree_node_dirty(b)) {
		if (!flush) {
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
		}

		if (locked) {
			/*
			 * Using the underscore version because we don't want to compact
			 * bsets after the write, since this node is about to be evicted
			 * - unless btree verify mode is enabled, since it runs out of
			 * the post write cleanup:
			 */
			if (static_branch_unlikely(&bch2_verify_btree_ondisk))
				bch2_btree_node_write(c, b, SIX_LOCK_intent,
						      BTREE_WRITE_cache_reclaim);
			else
				__bch2_btree_node_write(c, b,
							BTREE_WRITE_cache_reclaim);
		}
	}

	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U <<
BTREE_NODE_write_in_flight))) {
		if (!flush) {
			if (btree_node_read_in_flight(b))
				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
			else if (btree_node_write_in_flight(b))
				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
		}

		/* The caller handles -EINTR by dropping the node locks and retrying. */
		if (locked)
			return -EINTR;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	return 0;
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
/*
 * On success (return 0) @b is left holding both its intent and its write
 * lock. On failure no node lock is held and an error is returned.
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
retry_unlocked:
	ret = __btree_node_reclaim_checks(c, b, flush, false);
	if (ret)
		return ret;

	if (!six_trylock_intent(&b->c.lock)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	if (!six_trylock_write(&b->c.lock)) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
		six_unlock_intent(&b->c.lock);
		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
	}

	/* recheck under lock */
	ret = __btree_node_reclaim_checks(c, b, flush, true);
	if (ret) {
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		if (ret == -EINTR)
			goto retry_unlocked;
		return ret;
	}

	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return 0;
}

/* Reclaim without flushing: dirty nodes and nodes with I/O in flight are refused. */
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

/* Reclaim with flush set: dirty nodes are written and in-flight I/O is waited for. */
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

/*
 * Shrinker scan callback for one live list (shrink->private_data).
 *
 * Frees buffers first from the freeable list (skipping its first three
 * entries), then from the live list, never exceeding what
 * btree_cache_can_free() allows. Returns the number of nodes freed, or
 * SHRINK_STOP when the shrinker is disabled.
 */
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long
nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	/* Start writeback once dirty nodes plus the request reach 3/4 of the list. */
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(list);
	if (nr > can_free) {
		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
		nr = can_free;
	}

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
			bc->nr_freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
			/* Recently used: clear the bit and do not count it as scanned. */
			clear_btree_node_accessed(b);
			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
			--touched;;
		} else if (!btree_node_reclaim(c, b)) {
			__bch2_btree_node_hash_remove(bc, b);
			__btree_node_data_free(b);
			btree_node_to_freedlist(bc, b);

			freed++;
			bc->nr_freed++;

			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			/*
			 * Move the list head to just after @b, then drop
			 * bc->lock for the duration of the write.
			 */
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	/* Move the list head to just before @t unless the walk reached the end. */
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

/*
 * Shrinker count callback: what btree_cache_can_free() reports for this
 * list, or 0 when the shrinker is disabled.
 */
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct btree_cache_list *list = shrink->private_data;

	if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
		return 0;

	return btree_cache_can_free(list);
}

/*
 * Tear down the btree node cache: free both shrinkers, unhash every live
 * node, free all data buffers and all struct btree, then destroy the hash
 * table if it was initialised. All cache counters must be zero afterwards.
 */
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long flags;

	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	/* Put the btree roots on live[0] so the loops below tear them down too. */
	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(bc, b);
		cond_resched();
	}

	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_long_read(&c->btree_cache.nr_dirty));

	/* Merge both freed lists so a single loop frees every struct btree. */
	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}

/*
 * Set up the btree node cache: initialise the hash table, preallocate
 * nr_reserve nodes onto the freeable list and register one shrinker per live
 * list. Returns 0 on success. Every failure, including one from
 * rhashtable_init(), is reported as ENOMEM_fs_btree_cache_init, and nothing
 * is undone on this path.
 */
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->nr_reserve; i++) {
		struct btree *b = __bch2_btree_node_mem_alloc(c);
		if (!b)
			goto err;
		__bch2_btree_node_to_freelist(bc, b);
	}

	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
err:
	return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
}

/* Initialise the cache mutex and list heads; live[i].idx records its own index. */
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
*/
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	/* Only the task recorded in alloc_lock releases it and wakes waiters. */
	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

/*
 * Try to take the cannibalize lock for the current task.
 *
 * Succeeds (returns 0) when alloc_lock was free or is already owned by
 * current. Otherwise: without @cl, returns an
 * ENOMEM_btree_cache_cannibalize_lock error; with @cl, the closure is put on
 * alloc_wait, the lock is tried once more, and
 * btree_cache_cannibalize_lock_blocked is returned if it is still held by
 * someone else.
 */
int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = NULL;
	if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}

/*
 * Pick a live node to steal memory from. Both live lists are walked in
 * reverse: first accepting only nodes that reclaim without flushing, then
 * repeatedly with flushing allowed until one succeeds. This function never
 * returns NULL; it keeps looping instead.
 */
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b))
				return b;

	while (1) {
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

/*
 * Get a struct btree with data buffers attached and its bookkeeping fields
 * reset.
 *
 * Sources are tried in order: a struct from the matching freed list or a new
 * allocation; buffers from a freeable node or a new allocation; and, if
 * allocation fails while the current task holds the cannibalize lock, a node
 * stolen from the live lists. The transaction may be unlocked on the way and
 * is relocked before returning.
 *
 * Returns the node, ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc) when memory
 * cannot be obtained and cannibalizing is not permitted, or the ERR_PTR of a
 * failed bch2_trans_relock() (the node is put back on the freeable list).
 */
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		?
&bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();

	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	/* First try without blocking; on failure retry with GFP_KERNEL, all locks dropped. */
	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (b) {
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
	} else {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
		mutex_lock(&bc->lock);
	}

	/* Take intent and write on the new node; both trylocks must succeed. */
	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));

got_node:
	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			/* Take b2's buffers; b2 becomes a bare struct on a freed list. */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);

			list_del_init(&b2->list);
			--bc->nr_freeable;
			btree_node_to_freedlist(bc, b2);
			mutex_unlock(&bc->lock);

			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

got_mem:
	BUG_ON(!list_empty(&b->list));
	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
		return ERR_PTR(ret);
	}

	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another
cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		__bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			/* We have a struct already: take only the victim's buffers. */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			/* No struct was allocated: reuse the victim node itself. */
			b = b2;
		}

		BUG_ON(!list_empty(&b->list));
		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
/*
 * Allocate a node for key @k, insert it into the cache and start reading it.
 *
 * Returns the node; NULL when the hash insert failed (treated as a race with
 * another fill), when @path is set and @sync is false, or when the node could
 * not be relocked after the read; or an ERR_PTR for topology errors,
 * transaction restarts and allocation failures.
 */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
				struct btree_path *path,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b =
bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		/* Without a path there is no transaction restart; return the error as is. */
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		__bch2_btree_node_to_freelist(bc, b);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);
	six_unlock_write(&b->c.lock);

	if (path) {
		/* Remember the lock sequence so the node can be relocked after the read. */
		u32 seq = six_lock_seq(&b->c.lock);

		/* Unlock before doing IO: */
		six_unlock_intent(&b->c.lock);
		bch2_trans_unlock(trans);

		bch2_btree_node_read(trans, b, sync);

		int ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		if (!sync)
			return NULL;

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			b = NULL;
	} else {
		bch2_btree_node_read(trans, b, sync);
		if (lock_type == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	}

	return b;
}

/*
 * Report a topology error describing the mismatch between the key pointing
 * at @b and the header read into b->data. Nothing is reported until the
 * check_allocations recovery pass has completed.
 */
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
		   "btree node header doesn't match ptr: ");
	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
	prt_str(&buf, "\nptr: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\nheader: ");
	bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
	prt_str(&buf, "\nmin ");
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}

/*
 * Compare the in-memory identity of @b (btree id, level, max key and, for
 * btree_ptr_v2 keys, min key) with the header in b->data. Any mismatch is
 * passed to btree_bad_header().
 */
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id !=
BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

/*
 * Look up the node for @k in the cache, filling it from disk on a miss, and
 * return it holding a lock of type @lock_type.
 *
 * Returns the node, or an ERR_PTR for transaction restarts, fill errors and
 * nodes with a cached read error (-BCH_ERR_btree_node_read_err_cached).
 */
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		/* Recheck identity now that the node is locked: it may have been reused. */
		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Drop all locks while waiting for the read to finish. */
		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);

		/*
		 *
should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	/* Prefetch aux_data and the first three cache lines of each bset's part of it. */
	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, depending on
 * the @lock_type parameter.
* * Returns: btree node or ERR_PTR() */ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree *b; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); b = btree_node_mem_ptr(k); /* * Check b->hash_val _before_ calling btree_node_lock() - this might not * be the node we want anymore, and trying to lock the wrong node could * cause an unneccessary transaction restart: */ if (unlikely(!c->opts.btree_node_mem_ptr_optimization || !b || b->hash_val != btree_ptr_hash_val(k))) return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); if (bch2_btree_node_relock(trans, path, level + 1)) return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } if (unlikely(btree_node_read_in_flight(b))) { six_unlock_type(&b->c.lock, lock_type); return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); } prefetch(b->aux_data); for_each_bset(b, t) { void *p = (u64 *) b->aux_data + t->aux_data_offset; prefetch(p + L1_CACHE_BYTES * 0); prefetch(p + L1_CACHE_BYTES * 1); prefetch(p + L1_CACHE_BYTES * 2); } /* avoid atomic set bit if it's not needed: */ if (!btree_node_accessed(b)) set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } 
EBUG_ON(b->c.btree_id != path->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); return b; } struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, const struct bkey_i *k, enum btree_id btree_id, unsigned level, bool nofill) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); if (c->opts.btree_node_mem_ptr_optimization) { b = btree_node_mem_ptr(k); if (b) goto lock_node; } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { if (nofill) goto out; b = bch2_btree_node_fill(trans, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ if (!b) goto retry; if (IS_ERR(b) && !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if (IS_ERR(b)) goto out; } else { lock_node: ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || b->c.level != level)) { six_unlock_read(&b->c.lock); goto retry; } /* avoid atomic set bit if it's not needed: */ if (!btree_node_accessed(b)) set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks held: */ __bch2_btree_node_wait_on_read(b); prefetch(b->aux_data); for_each_bset(b, t) { void *p = (u64 *) b->aux_data + t->aux_data_offset; prefetch(p + L1_CACHE_BYTES * 0); prefetch(p + L1_CACHE_BYTES * 1); prefetch(p + L1_CACHE_BYTES * 2); } if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: bch2_btree_cache_cannibalize_unlock(trans); return b; } int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned 
level) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); int ret = PTR_ERR_OR_ZERO(b); if (ret) return ret; if (b) six_unlock_read(&b->c.lock); return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; b = btree_cache_find(bc, k); if (!b) return; BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ /* XXX we're called from btree_gc which will be holding other btree * nodes locked */ __bch2_btree_node_wait_on_read(b); __bch2_btree_node_wait_on_write(b); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (unlikely(b->hash_val != btree_ptr_hash_val(k))) goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } BUG_ON(btree_node_dirty(b)); mutex_lock(&bc->lock); bch2_btree_node_hash_remove(bc, b); btree_node_data_free(bc, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } const char *bch2_btree_id_str(enum btree_id btree) { return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; } void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) { if (btree < BTREE_ID_NR) prt_str(out, __bch2_btree_ids[btree]); else prt_printf(out, "(unknown btree %u)", btree); } void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) { prt_str(out, "btree="); bch2_btree_id_to_text(out, btree); prt_printf(out, " level=%u", level); } void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, enum btree_id btree, unsigned level, struct bkey_s_c k) { bch2_btree_id_to_text(out, btree); prt_printf(out, " level %u/", level); struct btree_root *r = bch2_btree_id_root(c, btree); if (r) prt_printf(out, "%u", r->level); else prt_printf(out, "(unknown)"); prt_newline(out); bch2_bkey_val_to_text(out, c, k); } void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { struct bset_stats stats; memset(&stats, 0, sizeof(stats)); bch2_btree_keys_stats(b, &stats); prt_printf(out, "l %u ", b->c.level); bch2_bpos_to_text(out, b->data->min_key); prt_printf(out, " - "); bch2_bpos_to_text(out, b->data->max_key); prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); prt_printf(out, " format: "); bch2_bkey_format_to_text(out, &b->format); prt_printf(out, " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, stats.failed); } static void 
prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, const char *label, size_t nr) { prt_printf(out, "%s\t", label); prt_human_readable_u64(out, nr * c->opts.btree_node_size); prt_printf(out, " (%zu)\n", nr); } static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { #define x(n) #n, BCH_BTREE_CACHE_NOT_FREED_REASONS() #undef x NULL }; void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); prt_btree_cache_line(out, c, "live:", bc->live[0].nr); prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held"); prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { bch2_btree_id_to_text(out, i); prt_printf(out, "\t"); prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); } prt_newline(out); prt_printf(out, "counters since mount:\n"); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) prt_printf(out, " %s\t%llu\n", bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); }
8 2 8 6 8 8 3 7 3 7 3 7 3 7 8 3 7 8 8 7 3 25 24 1 22 2 21 1 20 1 20 20 19 1 19 18 18 18 18 17 18 18 18 18 18 18 17 18 12 12 2 3 3 1 3 9 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct 
genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > 
ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len))) || ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, kr->hds_thresh) || nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, kr->hds_thresh_max)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), 
[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_HDS_THRESH], "setting hds-thresh is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & 
ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.hds_thresh, tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && dev_xdp_sb_prog_count(dev)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "tcp-data-split can not be enabled with single buffer XDP"); return -EINVAL; } if (dev_get_min_mp_channel_count(dev)) { if (kernel_ringparam.tcp_data_split != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(info->extack, "can't disable tcp-data-split while device has 
memory provider enabled"); return -EINVAL; } else if (kernel_ringparam.hds_thresh) { NL_SET_ERR_MSG(info->extack, "can't set non-zero hds_thresh while device is memory provider enabled"); return -EINVAL; } } /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, };
2 1 4 14 5 5 4 4 3 2 4 2 2 1 1 1 1 1 5 5 3 8 8 8 8 8 6 2 6 4 4 3 8 2 5 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */

#include "devl_internal.h"

/* True if @devlink_rate is a leaf rate object (DEVLINK_RATE_TYPE_LEAF). */
static inline bool
devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
{
	return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
}

/* True if @devlink_rate is a node rate object (DEVLINK_RATE_TYPE_NODE). */
static inline bool
devlink_rate_is_node(struct devlink_rate *devlink_rate)
{
	return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
}

/*
 * Resolve the leaf rate object of the port addressed by the request.
 *
 * Returns the port's devlink_rate, the error from
 * devlink_port_get_from_attrs(), or ERR_PTR(-ENODEV) if the port has no rate
 * object attached.
 */
static struct devlink_rate *
devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
{
	struct devlink_rate *devlink_rate;
	struct devlink_port *devlink_port;

	devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
	if (IS_ERR(devlink_port))
		return ERR_CAST(devlink_port);
	devlink_rate = devlink_port->devlink_rate;
	return devlink_rate ?: ERR_PTR(-ENODEV);
}

/*
 * Find the node-type rate object called @node_name on @devlink's rate list.
 * Leaf objects are skipped. Returns ERR_PTR(-ENODEV) if there is no match.
 */
static struct devlink_rate *
devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
{
	struct devlink_rate *devlink_rate;

	list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
		if (devlink_rate_is_node(devlink_rate) &&
		    !strcmp(node_name, devlink_rate->name))
			return devlink_rate;
	}
	return ERR_PTR(-ENODEV);
}

/*
 * Look up a rate node by the DEVLINK_ATTR_RATE_NODE_NAME attribute.
 *
 * Returns ERR_PTR(-EINVAL) if the attribute is missing, empty, or consists
 * only of decimal digits; otherwise the result of the by-name lookup.
 */
static struct devlink_rate *
devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
{
	const char *rate_node_name;
	size_t len;

	if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
		return ERR_PTR(-EINVAL);
	rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
	len = strlen(rate_node_name);
	/* Name cannot be empty or decimal number */
	if (!len || strspn(rate_node_name, "0123456789") == len)
		return ERR_PTR(-EINVAL);

	return devlink_rate_node_get_by_name(devlink, rate_node_name);
}

/* Same as devlink_rate_node_get_from_attrs(), taking the attrs from @info. */
static struct devlink_rate *
devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
{
	return devlink_rate_node_get_from_attrs(devlink, info->attrs);
}

/*
 * Resolve the rate object addressed by a request: a leaf when
 * DEVLINK_ATTR_PORT_INDEX is present, else a node when
 * DEVLINK_ATTR_RATE_NODE_NAME is present, else ERR_PTR(-EINVAL).
 */
static struct devlink_rate *
devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
{
	struct nlattr **attrs = info->attrs;

	if (attrs[DEVLINK_ATTR_PORT_INDEX])
		return devlink_rate_leaf_get_from_info(devlink, info);
	else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
		return devlink_rate_node_get_from_info(devlink, info);
	else
		return ERR_PTR(-EINVAL);
}

/*
 * Emit one nested DEVLINK_ATTR_RATE_TC_BWS attribute per traffic class, each
 * holding the class index and the bandwidth value tc_bw[index].
 *
 * Returns 0 or -EMSGSIZE; on a failed put, the nest that was being built is
 * cancelled.
 */
static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
{
	struct nlattr *nla_tc_bw;
	int i;

	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
		nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
		if (!nla_tc_bw)
			return -EMSGSIZE;

		if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) ||
		    nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i]))
			goto nla_put_failure;

		nla_nest_end(msg, nla_tc_bw);
	}
	return 0;

nla_put_failure:
	nla_nest_cancel(msg, nla_tc_bw);
	return -EMSGSIZE;
}

/*
 * Build one rate message for @devlink_rate in @msg.
 *
 * The message carries the devlink handle, the rate type, the port index
 * (leaf) or node name (node), tx_share, tx_max, tx_priority, tx_weight, the
 * parent node name when a parent is set, and the per-traffic-class
 * bandwidths.
 *
 * Returns 0, or -EMSGSIZE if the header or any attribute does not fit, in
 * which case the partially built message is cancelled.
 */
static int devlink_nl_rate_fill(struct sk_buff *msg,
				struct devlink_rate *devlink_rate,
				enum devlink_command cmd, u32 portid, u32 seq,
				int flags, struct netlink_ext_ack *extack)
{
	struct devlink *devlink = devlink_rate->devlink;
	void *hdr;

	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
	if (!hdr)
		return -EMSGSIZE;

	if (devlink_nl_put_handle(msg, devlink))
		goto nla_put_failure;

	if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
		goto nla_put_failure;

	/* Identify the object: leaves by port index, nodes by name. */
	if (devlink_rate_is_leaf(devlink_rate)) {
		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
				devlink_rate->devlink_port->index))
			goto nla_put_failure;
	} else if (devlink_rate_is_node(devlink_rate)) {
		if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
				   devlink_rate->name))
			goto nla_put_failure;
	}

	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE,
			       devlink_rate->tx_share))
		goto nla_put_failure;

	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX,
			       devlink_rate->tx_max))
		goto nla_put_failure;

	if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY,
			devlink_rate->tx_priority))
		goto nla_put_failure;

	if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT,
			devlink_rate->tx_weight))
		goto nla_put_failure;

	if (devlink_rate->parent)
		if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
				   devlink_rate->parent->name))
			goto nla_put_failure;

	if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
		goto nla_put_failure;

	genlmsg_end(msg, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(msg, hdr);
	return -EMSGSIZE;
}

/*
 * Send a DEVLINK_CMD_RATE_NEW / DEVLINK_CMD_RATE_DEL notification for
 * @devlink_rate.
 *
 * Nothing is sent when the devlink instance is not registered or
 * devlink_nl_notify_need() says no. Allocation and fill failures are dropped
 * silently; the function has no return value.
 */
static void devlink_rate_notify(struct devlink_rate *devlink_rate,
				enum devlink_command cmd)
{
	struct devlink *devlink = devlink_rate->devlink;
	struct sk_buff *msg;
	int err;

	WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);

	if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
		return;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return;

	err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL);
	if (err) {
		nlmsg_free(msg);
		return;
	}

	devlink_nl_notify_send(devlink, msg);
}

/* Notify DEVLINK_CMD_RATE_NEW for every rate object, in list order. */
void devlink_rates_notify_register(struct devlink *devlink)
{
	struct devlink_rate *rate_node;

	list_for_each_entry(rate_node, &devlink->rate_list, list)
		devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
}

/* Notify DEVLINK_CMD_RATE_DEL for every rate object, in reverse list order. */
void devlink_rates_notify_unregister(struct devlink *devlink)
{
	struct devlink_rate *rate_node;

	list_for_each_entry_reverse(rate_node, &devlink->rate_list, list)
		devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
}

/*
 * Dump the rate objects of one devlink instance into @msg.
 *
 * Entries before state->idx are skipped, so a dump interrupted by a full
 * message resumes at the entry that failed to fit; that index is stored back
 * into state->idx before the error is returned.
 */
static int
devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
			     struct netlink_callback *cb, int flags)
{
	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
	struct devlink_rate *devlink_rate;
	int idx = 0;
	int err = 0;

	list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
		enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
		u32 id = NETLINK_CB(cb->skb).portid;

		if (idx < state->idx) {
			idx++;
			continue;
		}
		err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
					   cb->nlh->nlmsg_seq, flags, NULL);
		if (err) {
			state->idx = idx;
			break;
		}
		idx++;
	}

	return err;
}

/* Netlink dumpit handler: dump rate objects via devlink_nl_dumpit(). */
int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
	return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one);
}

/*
 * Netlink doit handler: reply with a single DEVLINK_CMD_RATE_NEW message
 * describing the rate object addressed by the request.
 *
 * Returns the lookup error, -ENOMEM, the fill error, or the result of
 * genlmsg_reply().
 */
int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_rate *devlink_rate;
	struct sk_buff *msg;
	int err;

	devlink_rate = devlink_rate_get_from_info(devlink, info);
	if (IS_ERR(devlink_rate))
		return PTR_ERR(devlink_rate);

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW,
				   info->snd_portid, info->snd_seq, 0,
				   info->extack);
	if (err) {
		nlmsg_free(msg);
		return err;
	}

	return genlmsg_reply(msg, info);
}

/*
 * Walk the ->parent chain starting at @parent and report whether
 * @devlink_rate appears on it (including @parent itself).
 */
static bool
devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
			    struct devlink_rate *parent)
{
	while (parent) {
		if (parent == devlink_rate)
			return true;
		parent = parent->parent;
	}
	return false;
}

/*
 * Change the parent of @devlink_rate according to the name in @nla_parent.
 *
 *  - empty name and a parent is set: detach from the current parent;
 *  - non-empty name: attach to the node of that name, replacing any current
 *    parent;
 *  - empty name and no parent: nothing to do.
 *
 * The driver callback (rate_leaf_parent_set / rate_node_parent_set) is invoked
 * first; the parent pointer and the parents' refcounts are only updated when
 * it succeeds. The callbacks are called without a NULL check here -
 * presumably their presence is validated by the caller; verify.
 *
 * Returns 0 on success, -ENODEV if the named parent does not exist, -EINVAL
 * when asked to parent an object to itself, -EEXIST when a node would become
 * a descendant of itself, the driver's error, or -EOPNOTSUPP if the object is
 * neither a leaf nor a node.
 */
static int
devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
				struct genl_info *info,
				struct nlattr *nla_parent)
{
	struct devlink *devlink = devlink_rate->devlink;
	const char *parent_name = nla_data(nla_parent);
	const struct devlink_ops *ops = devlink->ops;
	size_t len = strlen(parent_name);
	struct devlink_rate *parent;
	int err = -EOPNOTSUPP;

	parent = devlink_rate->parent;

	if (parent && !len) {
		/* Detach: tell the driver the new parent is NULL. */
		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_parent_set(devlink_rate, NULL,
							devlink_rate->priv, NULL,
							info->extack);
		else if (devlink_rate_is_node(devlink_rate))
			err = ops->rate_node_parent_set(devlink_rate, NULL,
							devlink_rate->priv, NULL,
							info->extack);
		if (err)
			return err;

		refcount_dec(&parent->refcnt);
		devlink_rate->parent = NULL;
	} else if (len) {
		parent = devlink_rate_node_get_by_name(devlink, parent_name);
		if (IS_ERR(parent))
			return -ENODEV;

		if (parent == devlink_rate) {
			NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed");
			return -EINVAL;
		}

		/* Refuse to create a cycle in the node hierarchy. */
		if (devlink_rate_is_node(devlink_rate) &&
		    devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
			NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node.");
			return -EEXIST;
		}

		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_parent_set(devlink_rate, parent,
							devlink_rate->priv, parent->priv,
							info->extack);
		else if (devlink_rate_is_node(devlink_rate))
			err = ops->rate_node_parent_set(devlink_rate, parent,
							devlink_rate->priv, parent->priv,
							info->extack);
		if (err)
			return err;

		if (devlink_rate->parent)
			/* we're reassigning to other parent in this case */
			refcount_dec(&devlink_rate->parent->refcnt);

		refcount_inc(&parent->refcnt);
		devlink_rate->parent = parent;
	}

	return 0;
}

/*
 * Parse one nested DEVLINK_ATTR_RATE_TC_BWS attribute into @tc_bw.
 *
 * Both the index and the bandwidth sub-attribute are required. @bitmap
 * records which indices have been seen so that a repeated index is rejected.
 * The index is used directly as an array/bit index; presumably
 * devlink_dl_rate_tc_bws_nl_policy bounds it - verify against the policy.
 *
 * Returns 0, the nla_parse_nested() error, or -EINVAL for a missing
 * sub-attribute or duplicate index.
 */
static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
				       unsigned long *bitmap,
				       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1];
	u8 tc_index;
	int err;

	err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest,
			       devlink_dl_rate_tc_bws_nl_policy, extack);
	if (err)
		return err;

	if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) {
		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
				     DEVLINK_RATE_TC_ATTR_INDEX);
		return -EINVAL;
	}

	tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]);

	if (!tb[DEVLINK_RATE_TC_ATTR_BW]) {
		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
				     DEVLINK_RATE_TC_ATTR_BW);
		return -EINVAL;
	}

	if (test_and_set_bit(tc_index, bitmap)) {
		NL_SET_ERR_MSG_FMT(extack,
				   "Duplicate traffic class index specified (%u)",
				   tc_index);
		return -EINVAL;
	}

	tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]);

	return 0;
}

/*
 * Apply the per-traffic-class bandwidths carried in the request.
 *
 * Every DEVLINK_ATTR_RATE_TC_BWS attribute in the message is parsed, and the
 * request is rejected unless all DEVLINK_RATE_TCS_MAX classes were given. The
 * values are passed to the driver's rate_leaf_tc_bw_set / rate_node_tc_bw_set
 * callback and copied into devlink_rate->tc_bw only if it succeeds.
 *
 * Returns 0, a parse error, -EINVAL when a class is missing, the driver's
 * error, or -EOPNOTSUPP if the object is neither a leaf nor a node.
 */
static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
				     struct genl_info *info)
{
	DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
	struct devlink *devlink = devlink_rate->devlink;
	const struct devlink_ops *ops = devlink->ops;
	u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
	int rem, err = -EOPNOTSUPP, i;
	struct nlattr *attr;

	nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
				 GENL_HDRLEN, rem) {
		err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
						  info->extack);
		if (err)
			return err;
	}

	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
		if (!test_bit(i, bitmap)) {
			NL_SET_ERR_MSG_FMT(info->extack,
					   "Bandwidth values must be specified for all %u traffic classes",
					   DEVLINK_RATE_TCS_MAX);
			return -EINVAL;
		}
	}

	if (devlink_rate_is_leaf(devlink_rate))
		err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
					       tc_bw, info->extack);
	else if (devlink_rate_is_node(devlink_rate))
		err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
					       tc_bw, info->extack);

	if (err)
		return err;

	memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));

	return 0;
}

static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
			       const struct devlink_ops *ops,
			       struct genl_info *info)
{
	struct nlattr *nla_parent, **attrs = info->attrs;
	int err = -EOPNOTSUPP;
	u32 priority;
	u32 weight;
	u64 rate;

	if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
		rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
							  rate, info->extack);
		else if (devlink_rate_is_node(devlink_rate))
			err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
							  rate, info->extack);
		if (err)
			return err;
		devlink_rate->tx_share = rate;
	}

	if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
		rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
							rate, info->extack);
		else if (devlink_rate_is_node(devlink_rate))
			err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
							rate, info->extack);
		if (err)
			return err;
		devlink_rate->tx_max = rate;
	}

	if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) {
		priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]);
		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv,
							     priority, info->extack);
		else if (devlink_rate_is_node(devlink_rate))
			err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv,
							     priority, info->extack);

		if (err)
			return err;
		devlink_rate->tx_priority = priority;
	}

	if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) {
		weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]);
		if (devlink_rate_is_leaf(devlink_rate))
			err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv,
							   weight, info->extack);
		else if
(devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { err = devlink_nl_rate_tc_bw_set(devlink_rate, info); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_leaf_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); 
return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_node_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if 
(!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. 
Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. 
*/ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. */ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. 
*/ void devl_rate_nodes_destroy(struct devlink *devlink) { const struct devlink_ops *ops = devlink->ops; struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
200 200 117 21 117 6 117 27 20 191 124 8 199 199 200 200 200 186 191 141 9 141 141 117 124 2 2 198 200 197 21 2 21 123 21 199 198 200 196 20 200 187 15 200 203 96 146 85 146 9 4 72 128 1 199 122 8 2 2 2 2 1 3 2 1 4 90 182 106 81 9 2 2 1 1 1 3 2 1 3 200 199 2 32 167 161 4 165 3 200 3 199 200 198 1 199 195 190 189 21 200 1 18 51 201 14 2 198 200 200 199 200 198 2 200 2 2 200 2 197 199 200 199 2 68 75 96 172 38 106 131 2 197 70 145 181 181 21 1 76 38 24 75 69 4 156 97 36 9 79 83 81 3 87 88 4 4 47 47 9 47 6 33 33 4 4 32 1 32 4 14 1 22 43 2 1 17 17 6 2 3 1 38 4 37 22 40 20 5 12 3 5 2 23 35 2 2 24 7 21 1 20 16 6 13 15 30 6 3 6 5 1 1 3 9 9 7 2 1 1 1 52 16 2 1 34 32 9 24 26 5 34 3 24 12 2 26 22 15 2 20 20 4 17 2 23 1 10 5 19 18 3 17 1 38 3 2 36 2 2 2 2 36 1 2 13 3 35 35 2 33 2 4 19 28 16 7 24 4 5 5 4 6 15 9 1 14 1 1 1 2 2 3 3 3 53 40 14 3 50 36 3 1 1 4 8 10 2 7 34 4 5 38 3 3 3 7 1 1 1 1 5 11 1 1 1 2 3 3 6 4 4 1 3 4 4 2 2 11 1 2 6 1 4 2 2 2 2 1 4 24 3 24 24 24 32 32 23 27 32 1 18 7 18 4 4 23 22 1 25 8 16 10 15 25 25 32 16 16 32 31 212 1 1 1 9 26 6 20 1 6 17 2 2 1 2 6 1 1 1 5 33 1 1 7 11 1 5 1 13 16 15 1 2 1 1 3 18 2 3 5 1 37 37 44 45 21 49 14 35 35 35 7 42 42 42 21 24 37 14 2 9 3 13 1 2 9 1 3 2 1 2 6 20 23 23 20 23 22 23 6 17 23 23 21 20 17 17 17 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 
1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 
1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 
2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 
2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 
3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer / OSS compatible * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #if 0 #define PLUGIN_DEBUG #endif #if 0 #define OSS_DEBUG #endif #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/math64.h> #include <linux/string.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "pcm_plugin.h" #include <sound/info.h> #include <linux/soundcard.h> #include <sound/initval.h> #include <sound/mixer_oss.h> #define OSS_ALSAEMULVER _SIOR ('M', 249, int) static int dsp_map[SNDRV_CARDS]; static int adsp_map[SNDRV_CARDS] = {[0 ... 
(SNDRV_CARDS-1)] = 1}; static bool nonblock_open = 1; MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, Abramo Bagnara <abramo@alsa-project.org>"); MODULE_DESCRIPTION("PCM OSS emulation for ALSA."); MODULE_LICENSE("GPL"); module_param_array(dsp_map, int, NULL, 0444); MODULE_PARM_DESC(dsp_map, "PCM device number assigned to 1st OSS device."); module_param_array(adsp_map, int, NULL, 0444); MODULE_PARM_DESC(adsp_map, "PCM device number assigned to 2nd OSS device."); module_param(nonblock_open, bool, 0644); MODULE_PARM_DESC(nonblock_open, "Don't block opening busy PCM devices."); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM1); static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file); static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file); static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file); /* * helper functions to process hw_params */ static int snd_interval_refine_min(struct snd_interval *i, unsigned int min, int openmin) { int changed = 0; if (i->min < min) { i->min = min; i->openmin = openmin; changed = 1; } else if (i->min == min && !i->openmin && openmin) { i->openmin = 1; changed = 1; } if (i->integer) { if (i->openmin) { i->min++; i->openmin = 0; } } if (snd_interval_checkempty(i)) { snd_interval_none(i); return -EINVAL; } return changed; } static int snd_interval_refine_max(struct snd_interval *i, unsigned int max, int openmax) { int changed = 0; if (i->max > max) { i->max = max; i->openmax = openmax; changed = 1; } else if (i->max == max && !i->openmax && openmax) { i->openmax = 1; changed = 1; } if (i->integer) { if (i->openmax) { i->max--; i->openmax = 0; } } if (snd_interval_checkempty(i)) { snd_interval_none(i); return -EINVAL; } return changed; } static int snd_interval_refine_set(struct snd_interval *i, unsigned int val) { struct snd_interval t; t.empty = 0; t.min = t.max = val; t.openmin = t.openmax = 0; t.integer = 1; return snd_interval_refine(i, &t); 
}

/**
 * snd_pcm_hw_param_value_min
 * @params: the hw_params instance
 * @var: parameter to retrieve
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Return the minimum value for field PAR.
 *
 * For a mask parameter *@dir is set to 0; for an interval parameter
 * *@dir is set to the interval's openmin flag.  If @var is neither a
 * mask nor an interval, -EINVAL is returned; note that the return type
 * is unsigned int, so callers see that error code converted to an
 * unsigned value.
 */
static unsigned int
snd_pcm_hw_param_value_min(const struct snd_pcm_hw_params *params,
			   snd_pcm_hw_param_t var, int *dir)
{
	if (hw_is_mask(var)) {
		if (dir)
			*dir = 0;
		return snd_mask_min(hw_param_mask_c(params, var));
	}
	if (hw_is_interval(var)) {
		const struct snd_interval *i = hw_param_interval_c(params, var);
		if (dir)
			*dir = i->openmin;
		return snd_interval_min(i);
	}
	return -EINVAL;
}

/**
 * snd_pcm_hw_param_value_max
 * @params: the hw_params instance
 * @var: parameter to retrieve
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Return the maximum value for field PAR.
 *
 * For a mask parameter *@dir is set to 0; for an interval parameter
 * *@dir is set to the negated openmax flag (0 or -1).  Returns -EINVAL
 * if @var is neither a mask nor an interval.
 */
static int
snd_pcm_hw_param_value_max(const struct snd_pcm_hw_params *params,
			   snd_pcm_hw_param_t var, int *dir)
{
	if (hw_is_mask(var)) {
		if (dir)
			*dir = 0;
		return snd_mask_max(hw_param_mask_c(params, var));
	}
	if (hw_is_interval(var)) {
		const struct snd_interval *i = hw_param_interval_c(params, var);
		if (dir)
			*dir = - (int) i->openmax;
		return snd_interval_max(i);
	}
	return -EINVAL;
}

/*
 * Refine the mask parameter @var of @params with @val.  When the mask
 * changed, @var is flagged in both params->cmask and params->rmask.
 * Returns the result of snd_mask_refine().
 */
static int _snd_pcm_hw_param_mask(struct snd_pcm_hw_params *params,
				  snd_pcm_hw_param_t var,
				  const struct snd_mask *val)
{
	int changed;
	changed = snd_mask_refine(hw_param_mask(params, var), val);
	if (changed > 0) {
		params->cmask |= 1 << var;
		params->rmask |= 1 << var;
	}
	return changed;
}

/*
 * Apply mask @val to parameter @var and, if any parameter is flagged in
 * params->rmask, run snd_pcm_hw_refine() on the whole configuration.
 * Returns 0 on success or a negative error code.
 */
static int snd_pcm_hw_param_mask(struct snd_pcm_substream *pcm,
				 struct snd_pcm_hw_params *params,
				 snd_pcm_hw_param_t var,
				 const struct snd_mask *val)
{
	int changed = _snd_pcm_hw_param_mask(params, var, val);
	if (changed < 0)
		return changed;
	if (params->rmask) {
		int err = snd_pcm_hw_refine(pcm, params);
		if (err < 0)
			return err;
	}
	return 0;
}

/*
 * Raise the minimum of parameter @var to @val without running a full
 * refine.  @dir selects how the bound is interpreted: 0 keeps it closed,
 * a positive value makes it open at @val, and a negative value (with
 * @val > 0) makes it open at @val - 1.
 */
static int _snd_pcm_hw_param_min(struct snd_pcm_hw_params *params,
				 snd_pcm_hw_param_t var, unsigned int val,
				 int dir)
{
	int changed;
	int open = 0;
	if (dir) {
		if (dir > 0) {
			open = 1;
		} else if (dir < 0) {
			if
(val > 0) {
				/* "just below val": open bound at val - 1 */
				open = 1;
				val--;
			}
		}
	}
	if (hw_is_mask(var))
		/* masks hold discrete bits: an open bound becomes val + 1 */
		changed = snd_mask_refine_min(hw_param_mask(params, var),
					      val + !!open);
	else if (hw_is_interval(var))
		changed = snd_interval_refine_min(hw_param_interval(params, var),
						  val, open);
	else
		return -EINVAL;
	if (changed > 0) {
		/* flag the parameter as changed and as needing a refine */
		params->cmask |= 1 << var;
		params->rmask |= 1 << var;
	}
	return changed;
}

/**
 * snd_pcm_hw_param_min
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to refine
 * @val: minimal value
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Inside configuration space defined by PARAMS remove from PAR all
 * values < VAL. Reduce configuration space accordingly.
 * Return new minimum or -EINVAL if the configuration space is empty
 *
 * When @dir is non-NULL it is read as the requested direction and is
 * overwritten with the direction of the resulting minimum.
 */
static int snd_pcm_hw_param_min(struct snd_pcm_substream *pcm,
				struct snd_pcm_hw_params *params,
				snd_pcm_hw_param_t var, unsigned int val,
				int *dir)
{
	int changed = _snd_pcm_hw_param_min(params, var, val, dir ? *dir : 0);
	if (changed < 0)
		return changed;
	if (params->rmask) {
		/* propagate the new bound to the dependent parameters */
		int err = snd_pcm_hw_refine(pcm, params);
		if (err < 0)
			return err;
	}
	return snd_pcm_hw_param_value_min(params, var, dir);
}

/*
 * Lower the maximum of parameter @var to @val without running a full
 * refine.  @dir selects how the bound is interpreted: 0 keeps it closed,
 * a negative value makes it open at @val, and a positive value makes it
 * open at @val + 1.
 */
static int _snd_pcm_hw_param_max(struct snd_pcm_hw_params *params,
				 snd_pcm_hw_param_t var, unsigned int val,
				 int dir)
{
	int changed;
	int open = 0;
	if (dir) {
		if (dir < 0) {
			open = 1;
		} else if (dir > 0) {
			/* "just above val": open bound at val + 1 */
			open = 1;
			val++;
		}
	}
	if (hw_is_mask(var)) {
		if (val == 0 && open) {
			/* nothing can lie below bit 0: the mask becomes empty */
			snd_mask_none(hw_param_mask(params, var));
			changed = -EINVAL;
		} else
			/* masks hold discrete bits: an open bound becomes val - 1 */
			changed = snd_mask_refine_max(hw_param_mask(params, var),
						      val - !!open);
	} else if (hw_is_interval(var))
		changed = snd_interval_refine_max(hw_param_interval(params, var),
						  val, open);
	else
		return -EINVAL;
	if (changed > 0) {
		/* flag the parameter as changed and as needing a refine */
		params->cmask |= 1 << var;
		params->rmask |= 1 << var;
	}
	return changed;
}

/**
 * snd_pcm_hw_param_max
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to refine
 * @val: maximal value
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Inside configuration space defined by
PARAMS remove from PAR all
 * values >= VAL + 1. Reduce configuration space accordingly.
 * Return new maximum or -EINVAL if the configuration space is empty
 */
static int snd_pcm_hw_param_max(struct snd_pcm_substream *pcm,
				struct snd_pcm_hw_params *params,
				snd_pcm_hw_param_t var, unsigned int val,
				int *dir)
{
	int hint = 0;
	int ret;

	/* a NULL direction pointer means "exact" (0) */
	if (dir)
		hint = *dir;

	ret = _snd_pcm_hw_param_max(params, var, val, hint);
	if (ret < 0)
		return ret;

	/* something is flagged for refinement: propagate the new bound */
	if (params->rmask) {
		ret = snd_pcm_hw_refine(pcm, params);
		if (ret < 0)
			return ret;
	}

	return snd_pcm_hw_param_value_max(params, var, dir);
}

/*
 * Subtract boundary (b, bdir) from boundary (a, adir).
 *
 * Both directions are first reduced to their sign (-1, 0 or 1).  The
 * value difference is stored in *c and the direction difference in
 * *cdir; a direction difference of -2 or 2 additionally moves *c one
 * step down or up.  Always returns 0.
 */
static int boundary_sub(int a, int adir, int b, int bdir,
			int *c, int *cdir)
{
	int sign_a = (adir > 0) - (adir < 0);
	int sign_b = (bdir > 0) - (bdir < 0);
	int diff = a - b;
	int ddir = sign_a - sign_b;

	switch (ddir) {
	case -2:
		diff--;
		break;
	case 2:
		diff++;
		break;
	default:
		break;
	}

	*c = diff;
	*cdir = ddir;
	return 0;
}

/*
 * Return non-zero if boundary (a, adir) is lower than boundary (b, bdir).
 *
 * A negative direction is rewritten as "one less, direction 1" and any
 * positive direction is clamped to 1 before the values, and then the
 * directions, are compared.
 */
static int boundary_lt(unsigned int a, int adir,
		       unsigned int b, int bdir)
{
	if (adir) {
		if (adir < 0)
			a--;
		adir = 1;
	}
	if (bdir) {
		if (bdir < 0)
			b--;
		bdir = 1;
	}

	if (a != b)
		return a < b;
	return adir < bdir;
}

/* Return 1 if min is nearer to best than max */
static int boundary_nearer(int min, int mindir,
			   int best, int bestdir,
			   int max, int maxdir)
{
	int below, belowdir;	/* distance best - min */
	int above, abovedir;	/* distance max - best */

	boundary_sub(best, bestdir, min, mindir, &below, &belowdir);
	boundary_sub(max, maxdir, best, bestdir, &above, &abovedir);

	return boundary_lt(below, belowdir, above, abovedir);
}

/**
 * snd_pcm_hw_param_near
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to refine
 * @best: value to set
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Inside configuration space defined by PARAMS set PAR to the available value
 * nearest to BEST. Reduce configuration space accordingly.
 * This function cannot be called for SNDRV_PCM_HW_PARAM_ACCESS,
 * SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SUBFORMAT.
 * Return the value found.
*/
static int snd_pcm_hw_param_near(struct snd_pcm_substream *pcm,
				 struct snd_pcm_hw_params *params,
				 snd_pcm_hw_param_t var, unsigned int best,
				 int *dir)
{
	/* pristine copy of @params; carries the __free(kfree) cleanup annotation */
	struct snd_pcm_hw_params *save __free(kfree) = NULL;
	int v;
	unsigned int saved_min;
	int last = 0;
	int min, max;
	int mindir, maxdir;
	int valdir = dir ? *dir : 0;
	/* FIXME */
	/* min/max below are signed ints, so clamp the request to INT_MAX */
	if (best > INT_MAX)
		best = INT_MAX;
	min = max = best;
	mindir = maxdir = valdir;
	/*
	 * Derive the upper-side candidate boundary (max, maxdir) from the
	 * requested one; the lower-side candidate keeps (best, valdir).
	 */
	if (maxdir > 0)
		maxdir = 0;
	else if (maxdir == 0)
		maxdir = -1;
	else {
		maxdir = 1;
		max--;
	}
	save = kmalloc(sizeof(*save), GFP_KERNEL);
	if (save == NULL)
		return -ENOMEM;
	*save = *params;
	saved_min = min;
	/* first attempt: restrict @params to values at or above the request */
	min = snd_pcm_hw_param_min(pcm, params, var, min, &mindir);
	if (min >= 0) {
		/* scratch copy for the "at or below" attempt; also __free(kfree) */
		struct snd_pcm_hw_params *params1 __free(kfree) = NULL;
		if (max < 0)
			goto _end;
		/* the requested boundary itself is available: nothing to compare */
		if ((unsigned int)min == saved_min && mindir == valdir)
			goto _end;
		params1 = kmalloc(sizeof(*params1), GFP_KERNEL);
		if (params1 == NULL)
			return -ENOMEM;
		*params1 = *save;
		max = snd_pcm_hw_param_max(pcm, params1, var, max, &maxdir);
		if (max < 0)
			goto _end;
		/* keep whichever side ended up closer to the request */
		if (boundary_nearer(max, maxdir, best, valdir, min, mindir)) {
			*params = *params1;
			last = 1;
		}
	} else {
		/* nothing at or above the request: retry below it from the saved copy */
		*params = *save;
		max = snd_pcm_hw_param_max(pcm, params, var, max, &maxdir);
		if (max < 0)
			return max;
		last = 1;
	}
 _end:
	/* pin the parameter to the chosen end of the remaining range */
	if (last)
		v = snd_pcm_hw_param_last(pcm, params, var, dir);
	else
		v = snd_pcm_hw_param_first(pcm, params, var, dir);
	return v;
}

/*
 * Restrict parameter @var of @params to the single value @val without
 * running a full refine.
 *
 * @dir == 0 requests exactly @val.  For masks a non-zero @dir shifts the
 * requested bit by one in that direction; for intervals it requests the
 * open range between @val and its neighbour in that direction.  A
 * request below zero (@val == 0 with @dir < 0) empties the parameter and
 * yields -EINVAL.
 */
static int _snd_pcm_hw_param_set(struct snd_pcm_hw_params *params,
				 snd_pcm_hw_param_t var, unsigned int val,
				 int dir)
{
	int changed;
	if (hw_is_mask(var)) {
		struct snd_mask *m = hw_param_mask(params, var);
		if (val == 0 && dir < 0) {
			changed = -EINVAL;
			snd_mask_none(m);
		} else {
			if (dir > 0)
				val++;
			else if (dir < 0)
				val--;
			changed = snd_mask_refine_set(hw_param_mask(params, var), val);
		}
	} else if (hw_is_interval(var)) {
		struct snd_interval *i = hw_param_interval(params, var);
		if (val == 0 && dir < 0) {
			changed = -EINVAL;
			snd_interval_none(i);
		} else if (dir == 0)
			changed = snd_interval_refine_set(i, val);
		else {
struct snd_interval t;
			/* open, non-integer range between val and its neighbour */
			t.openmin = 1;
			t.openmax = 1;
			t.empty = 0;
			t.integer = 0;
			if (dir < 0) {
				t.min = val - 1;
				t.max = val;
			} else {
				t.min = val;
				t.max = val+1;
			}
			changed = snd_interval_refine(i, &t);
		}
	} else
		return -EINVAL;
	if (changed > 0) {
		/* flag the parameter as changed and as needing a refine */
		params->cmask |= 1 << var;
		params->rmask |= 1 << var;
	}
	return changed;
}

/**
 * snd_pcm_hw_param_set
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to set
 * @val: value to set
 * @dir: direction (-1,0,1), passed by value
 *
 * Inside configuration space defined by PARAMS remove from PAR all
 * values != VAL. Reduce configuration space accordingly.
 * Return the resulting value as reported by snd_pcm_hw_param_value(),
 * or a negative error code (-EINVAL if the configuration space is empty).
 */
static int snd_pcm_hw_param_set(struct snd_pcm_substream *pcm,
				struct snd_pcm_hw_params *params,
				snd_pcm_hw_param_t var, unsigned int val,
				int dir)
{
	int changed = _snd_pcm_hw_param_set(params, var, val, dir);
	if (changed < 0)
		return changed;
	if (params->rmask) {
		/* propagate the restriction to the dependent parameters */
		int err = snd_pcm_hw_refine(pcm, params);
		if (err < 0)
			return err;
	}
	return snd_pcm_hw_param_value(params, var, NULL);
}

/*
 * Apply snd_interval_setinteger() to the interval parameter @var.  When
 * that changed the interval, @var is flagged in params->cmask and
 * params->rmask.  Returns the result of snd_interval_setinteger().
 */
static int _snd_pcm_hw_param_setinteger(struct snd_pcm_hw_params *params,
					snd_pcm_hw_param_t var)
{
	int changed;
	changed = snd_interval_setinteger(hw_param_interval(params, var));
	if (changed > 0) {
		params->cmask |= 1 << var;
		params->rmask |= 1 << var;
	}
	return changed;
}

/*
 * plugin
 */

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
/*
 * Free every plugin on the runtime's plugin list and reset both list
 * ends to NULL.  Always returns 0.
 */
static int snd_pcm_oss_plugin_clear(struct snd_pcm_substream *substream)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	struct snd_pcm_plugin *plugin, *next;

	plugin = runtime->oss.plugin_first;
	while (plugin) {
		/* fetch the successor before the node is freed */
		next = plugin->next;
		snd_pcm_plugin_free(plugin);
		plugin = next;
	}
	runtime->oss.plugin_first = runtime->oss.plugin_last = NULL;
	return 0;
}

/*
 * Link @plugin at the head of its runtime's doubly linked plugin list.
 * Always returns 0.
 */
static int snd_pcm_plugin_insert(struct snd_pcm_plugin *plugin)
{
	struct snd_pcm_runtime *runtime = plugin->plug->runtime;
	plugin->next = runtime->oss.plugin_first;
	plugin->prev = NULL;
	if (runtime->oss.plugin_first) {
runtime->oss.plugin_first->prev = plugin;
		runtime->oss.plugin_first = plugin;
	} else {
		/* empty list: the new plugin is both first and last */
		runtime->oss.plugin_last =
		runtime->oss.plugin_first = plugin;
	}
	return 0;
}

/*
 * Link @plugin at the tail of its runtime's doubly linked plugin list.
 * Always returns 0.
 */
int snd_pcm_plugin_append(struct snd_pcm_plugin *plugin)
{
	struct snd_pcm_runtime *runtime = plugin->plug->runtime;
	plugin->next = NULL;
	plugin->prev = runtime->oss.plugin_last;
	if (runtime->oss.plugin_last) {
		runtime->oss.plugin_last->next = plugin;
		runtime->oss.plugin_last = plugin;
	} else {
		/* empty list: the new plugin is both first and last */
		runtime->oss.plugin_last =
		runtime->oss.plugin_first = plugin;
	}
	return 0;
}
#endif /* CONFIG_SND_PCM_OSS_PLUGINS */

/*
 * Convert a count of ALSA frames into OSS-side bytes.
 *
 * The frames are first converted to bytes with frames_to_bytes(); if the
 * ALSA buffer size in bytes differs from runtime->oss.buffer_bytes the
 * result is scaled by oss.buffer_bytes / buffer_size.  On builds where
 * long is narrower than 64 bits the product is computed in u64 and
 * divided with div_u64().
 */
static long snd_pcm_oss_bytes(struct snd_pcm_substream *substream, long frames)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	long buffer_size = snd_pcm_lib_buffer_bytes(substream);
	long bytes = frames_to_bytes(runtime, frames);
	if (buffer_size == runtime->oss.buffer_bytes)
		return bytes;
#if BITS_PER_LONG >= 64
	return runtime->oss.buffer_bytes * bytes / buffer_size;
#else
	{
		u64 bsize = (u64)runtime->oss.buffer_bytes * (u64)bytes;
		return div_u64(bsize, buffer_size);
	}
#endif
}

/*
 * Convert a count of OSS-side bytes into ALSA frames; the inverse
 * scaling of snd_pcm_oss_bytes().  When the two buffer sizes match the
 * bytes are converted directly, otherwise they are first scaled by
 * buffer_size / oss.buffer_bytes.
 */
static long snd_pcm_alsa_frames(struct snd_pcm_substream *substream, long bytes)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	long buffer_size = snd_pcm_lib_buffer_bytes(substream);
	if (buffer_size == runtime->oss.buffer_bytes)
		return bytes_to_frames(runtime, bytes);
	return bytes_to_frames(runtime, (buffer_size * bytes) / runtime->oss.buffer_bytes);
}

/* Accessor for runtime->hw_ptr_interrupt. */
static inline
snd_pcm_uframes_t get_hw_ptr_period(struct snd_pcm_runtime *runtime)
{
	return runtime->hw_ptr_interrupt;
}

/* define extended formats in the recent OSS versions (if any) */
/* linear formats */
#define AFMT_S32_LE      0x00001000
#define AFMT_S32_BE      0x00002000
#define AFMT_S24_LE      0x00008000
#define AFMT_S24_BE      0x00010000
#define AFMT_S24_PACKED  0x00040000

/* other supported formats */
#define AFMT_FLOAT       0x00004000
#define AFMT_SPDIF_RAW   0x00020000

/* unsupported formats */
#define AFMT_AC3         0x00000400
#define AFMT_VORBIS      0x00000800

static
snd_pcm_format_t snd_pcm_oss_format_from(int format) { switch (format) { case AFMT_MU_LAW: return SNDRV_PCM_FORMAT_MU_LAW; case AFMT_A_LAW: return SNDRV_PCM_FORMAT_A_LAW; case AFMT_IMA_ADPCM: return SNDRV_PCM_FORMAT_IMA_ADPCM; case AFMT_U8: return SNDRV_PCM_FORMAT_U8; case AFMT_S16_LE: return SNDRV_PCM_FORMAT_S16_LE; case AFMT_S16_BE: return SNDRV_PCM_FORMAT_S16_BE; case AFMT_S8: return SNDRV_PCM_FORMAT_S8; case AFMT_U16_LE: return SNDRV_PCM_FORMAT_U16_LE; case AFMT_U16_BE: return SNDRV_PCM_FORMAT_U16_BE; case AFMT_MPEG: return SNDRV_PCM_FORMAT_MPEG; case AFMT_S32_LE: return SNDRV_PCM_FORMAT_S32_LE; case AFMT_S32_BE: return SNDRV_PCM_FORMAT_S32_BE; case AFMT_S24_LE: return SNDRV_PCM_FORMAT_S24_LE; case AFMT_S24_BE: return SNDRV_PCM_FORMAT_S24_BE; case AFMT_S24_PACKED: return SNDRV_PCM_FORMAT_S24_3LE; case AFMT_FLOAT: return SNDRV_PCM_FORMAT_FLOAT; case AFMT_SPDIF_RAW: return SNDRV_PCM_FORMAT_IEC958_SUBFRAME; default: return SNDRV_PCM_FORMAT_U8; } } static int snd_pcm_oss_format_to(snd_pcm_format_t format) { switch (format) { case SNDRV_PCM_FORMAT_MU_LAW: return AFMT_MU_LAW; case SNDRV_PCM_FORMAT_A_LAW: return AFMT_A_LAW; case SNDRV_PCM_FORMAT_IMA_ADPCM: return AFMT_IMA_ADPCM; case SNDRV_PCM_FORMAT_U8: return AFMT_U8; case SNDRV_PCM_FORMAT_S16_LE: return AFMT_S16_LE; case SNDRV_PCM_FORMAT_S16_BE: return AFMT_S16_BE; case SNDRV_PCM_FORMAT_S8: return AFMT_S8; case SNDRV_PCM_FORMAT_U16_LE: return AFMT_U16_LE; case SNDRV_PCM_FORMAT_U16_BE: return AFMT_U16_BE; case SNDRV_PCM_FORMAT_MPEG: return AFMT_MPEG; case SNDRV_PCM_FORMAT_S32_LE: return AFMT_S32_LE; case SNDRV_PCM_FORMAT_S32_BE: return AFMT_S32_BE; case SNDRV_PCM_FORMAT_S24_LE: return AFMT_S24_LE; case SNDRV_PCM_FORMAT_S24_BE: return AFMT_S24_BE; case SNDRV_PCM_FORMAT_S24_3LE: return AFMT_S24_PACKED; case SNDRV_PCM_FORMAT_FLOAT: return AFMT_FLOAT; case SNDRV_PCM_FORMAT_IEC958_SUBFRAME: return AFMT_SPDIF_RAW; default: return -EINVAL; } } static int snd_pcm_oss_period_size(struct snd_pcm_substream *substream, struct 
snd_pcm_hw_params *oss_params,
				   struct snd_pcm_hw_params *slave_params)
{
	ssize_t s;
	ssize_t oss_buffer_size;
	ssize_t oss_period_size, oss_periods;
	ssize_t min_period_size, max_period_size;
	struct snd_pcm_runtime *runtime = substream->runtime;
	size_t oss_frame_size;

	/* bytes per frame on the OSS (client) side */
	oss_frame_size = snd_pcm_format_physical_width(params_format(oss_params)) *
			 params_channels(oss_params) / 8;

	/*
	 * Start from the largest buffer the slave allows, convert it to a
	 * client-side byte count and round it down to a power of two.
	 */
	oss_buffer_size = snd_pcm_hw_param_value_max(slave_params,
						     SNDRV_PCM_HW_PARAM_BUFFER_SIZE,
						     NULL);
	if (oss_buffer_size <= 0)
		return -EINVAL;
	oss_buffer_size = snd_pcm_plug_client_size(substream,
						   oss_buffer_size * oss_frame_size);
	if (oss_buffer_size <= 0)
		return -EINVAL;
	oss_buffer_size = rounddown_pow_of_two(oss_buffer_size);
	if (atomic_read(&substream->mmap_count)) {
		/* while mmapped, do not exceed runtime->oss.mmap_bytes */
		if (oss_buffer_size > runtime->oss.mmap_bytes)
			oss_buffer_size = runtime->oss.mmap_bytes;
	}

	/*
	 * Pick the period size, in priority order: the per-substream setup
	 * value (if > 16), the stored fragment shift, or a heuristic based
	 * on the byte rate and the subdivision setting.
	 */
	if (substream->oss.setup.period_size > 16)
		oss_period_size = substream->oss.setup.period_size;
	else if (runtime->oss.fragshift) {
		oss_period_size = 1 << runtime->oss.fragshift;
		/* keep at least two periods in the buffer */
		if (oss_period_size > oss_buffer_size / 2)
			oss_period_size = oss_buffer_size / 2;
	} else {
		int sd;
		size_t bytes_per_sec = params_rate(oss_params) * snd_pcm_format_physical_width(params_format(oss_params)) * params_channels(oss_params) / 8;

		/* halve the buffer until one period holds at most one second */
		oss_period_size = oss_buffer_size;
		do {
			oss_period_size /= 2;
		} while (oss_period_size > bytes_per_sec);
		if (runtime->oss.subdivision == 0) {
			/* no subdivision configured: choose one around 4096 bytes */
			sd = 4;
			if (oss_period_size / sd > 4096)
				sd *= 2;
			if (oss_period_size / sd < 4096)
				sd = 1;
		} else
			sd = runtime->oss.subdivision;
		oss_period_size /= sd;
		if (oss_period_size < 16)
			oss_period_size = 16;
	}

	/* never go below the slave's minimum period (rounded up to 2^n) */
	min_period_size = snd_pcm_plug_client_size(substream,
						   snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL));
	if (min_period_size > 0) {
		min_period_size *= oss_frame_size;
		min_period_size = roundup_pow_of_two(min_period_size);
		if (oss_period_size < min_period_size)
			oss_period_size = min_period_size;
	}

	/* ... nor above the slave's maximum period (rounded down to 2^n) */
	max_period_size = snd_pcm_plug_client_size(substream,
snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL));
	if (max_period_size > 0) {
		max_period_size *= oss_frame_size;
		max_period_size = rounddown_pow_of_two(max_period_size);
		if (oss_period_size > max_period_size)
			oss_period_size = max_period_size;
	}

	/* number of periods: derived from the sizes unless the setup overrides it */
	oss_periods = oss_buffer_size / oss_period_size;

	if (substream->oss.setup.periods > 1)
		oss_periods = substream->oss.setup.periods;

	/* upper limit: slave maximum, further capped by oss.maxfrags if set */
	s = snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL);
	if (s > 0 && runtime->oss.maxfrags && s > runtime->oss.maxfrags)
		s = runtime->oss.maxfrags;
	if (oss_periods > s)
		oss_periods = s;

	/* lower limit: slave minimum, but never fewer than two periods */
	s = snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL);
	if (s < 2)
		s = 2;
	if (oss_periods < s)
		oss_periods = s;

	/* shrink the period until all periods fit into the buffer */
	while (oss_period_size * oss_periods > oss_buffer_size)
		oss_period_size /= 2;

	if (oss_period_size < 16)
		return -EINVAL;

	/* don't allocate too large period; 1MB period must be enough */
	if (oss_period_size > 1024 * 1024)
		return -ENOMEM;

	/* publish the result; period_frames is set to 1 here as a placeholder */
	runtime->oss.period_bytes = oss_period_size;
	runtime->oss.period_frames = 1;
	runtime->oss.periods = oss_periods;
	return 0;
}

/*
 * Select a sample rate for @params based on @best_rate.
 *
 * Integer multiples of @best_rate (1x, 2x, ...) that lie inside the rate
 * interval are tried in turn with snd_pcm_hw_param_set(); the first one
 * accepted exactly is returned.  After each failed attempt @params is
 * restored from a saved copy.  If no multiple is accepted, the rate
 * nearest to @best_rate is chosen via snd_pcm_hw_param_near().
 *
 * Returns the selected rate or a negative error code.
 */
static int choose_rate(struct snd_pcm_substream *substream,
		       struct snd_pcm_hw_params *params, unsigned int best_rate)
{
	const struct snd_interval *it;
	/* pristine copy of @params; carries the __free(kfree) cleanup annotation */
	struct snd_pcm_hw_params *save __free(kfree) = NULL;
	unsigned int rate, prev;

	save = kmalloc(sizeof(*save), GFP_KERNEL);
	if (save == NULL)
		return -ENOMEM;
	*save = *params;
	it = hw_param_interval_c(save, SNDRV_PCM_HW_PARAM_RATE);

	/* try multiples of the best rate */
	rate = best_rate;
	for (;;) {
		/* stop once the candidate is beyond the interval's upper bound */
		if (it->max < rate || (it->max == rate && it->openmax))
			break;
		/* only try candidates that are not below the lower bound */
		if (it->min < rate || (it->min == rate && !it->openmin)) {
			int ret;
			ret = snd_pcm_hw_param_set(substream, params,
						   SNDRV_PCM_HW_PARAM_RATE,
						   rate, 0);
			if (ret == (int)rate)
				return rate;
			/* undo whatever the failed attempt did to @params */
			*params = *save;
		}
		prev = rate;
		rate += best_rate;
		/* also stops on unsigned wrap-around and when best_rate is 0 */
		if (rate <= prev)
			break;
	}

	/* not found, use the nearest rate */
	return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE,
best_rate, NULL);
}

/* parameter locking: returns immediately if tried during streaming */
/*
 * Takes runtime->oss.params_lock (interruptibly).  Returns -ERESTARTSYS
 * if interrupted, or -EBUSY (with the lock released again) while
 * oss.rw_ref is non-zero.  On a 0 return the lock is held and must be
 * dropped with unlock_params().
 */
static int lock_params(struct snd_pcm_runtime *runtime)
{
	if (mutex_lock_interruptible(&runtime->oss.params_lock))
		return -ERESTARTSYS;
	if (atomic_read(&runtime->oss.rw_ref)) {
		mutex_unlock(&runtime->oss.params_lock);
		return -EBUSY;
	}
	return 0;
}

/* Release the lock taken by a successful lock_params(). */
static void unlock_params(struct snd_pcm_runtime *runtime)
{
	mutex_unlock(&runtime->oss.params_lock);
}

/*
 * Free the OSS intermediate buffer (leaving the pointer NULL) and, when
 * plugins are compiled in, tear down the plugin chain.
 */
static void snd_pcm_oss_release_buffers(struct snd_pcm_substream *substream)
{
	struct snd_pcm_runtime *runtime = substream->runtime;

	kvfree(runtime->oss.buffer);
	runtime->oss.buffer = NULL;
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
	snd_pcm_oss_plugin_clear(substream);
#endif
}

/* call with params_lock held */
/*
 * Apply the pending OSS settings (rate, channels, format, fragment
 * layout) to the substream.
 *
 * Does nothing and returns 0 unless runtime->oss.params is set.
 * Otherwise it builds the slave hw_params ("sparams") and the client
 * view ("params"), installs hw and sw params through the kernel ioctl
 * interface, rebuilds the plugin chain when plugins are compiled in, and
 * reallocates the OSS intermediate buffer.  On success oss.params is
 * cleared and oss.prepare is set.  On any failure the OSS buffers are
 * released and a negative error code is returned.
 */
static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	struct snd_pcm_hw_params *params, *sparams;
	struct snd_pcm_sw_params *sw_params;
	ssize_t oss_buffer_size, oss_period_size;
	size_t oss_frame_size;
	int err;
	int direct;
	snd_pcm_format_t format, sformat;
	int n;
	const struct snd_mask *sformat_mask;
	struct snd_mask mask;

	if (!runtime->oss.params)
		return 0;
	/* all three are freed at the "failure" label, on success as well */
	sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL);
	params = kmalloc(sizeof(*params), GFP_KERNEL);
	sparams = kmalloc(sizeof(*sparams), GFP_KERNEL);
	if (!sw_params || !params || !sparams) {
		err = -ENOMEM;
		goto failure;
	}

	/* an mmapped stream always uses direct mode */
	if (atomic_read(&substream->mmap_count))
		direct = 1;
	else
		direct = substream->oss.setup.direct;

	/* slave side: start unrestricted, integer period count of at least 2 */
	_snd_pcm_hw_params_any(sparams);
	_snd_pcm_hw_param_setinteger(sparams, SNDRV_PCM_HW_PARAM_PERIODS);
	_snd_pcm_hw_param_min(sparams, SNDRV_PCM_HW_PARAM_PERIODS, 2, 0);
	/* acceptable access types depend on mmap and direct mode */
	snd_mask_none(&mask);
	if (atomic_read(&substream->mmap_count))
		snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_MMAP_INTERLEAVED);
	else {
		snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED);
		if (!direct)
			snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_NONINTERLEAVED);
	}
	err = snd_pcm_hw_param_mask(substream, sparams, SNDRV_PCM_HW_PARAM_ACCESS, &mask);
	if (err < 0) {
		pcm_dbg(substream->pcm, "No usable accesses\n");
		err = -EINVAL;
		goto failure;
	}

	err = choose_rate(substream, sparams, runtime->oss.rate);
	if (err < 0)
		goto failure;
	err = snd_pcm_hw_param_near(substream, sparams,
				    SNDRV_PCM_HW_PARAM_CHANNELS,
				    runtime->oss.channels, NULL);
	if (err < 0)
		goto failure;

	format = snd_pcm_oss_format_from(runtime->oss.format);

	/* choose the slave format: the client's own in direct mode */
	sformat_mask = hw_param_mask_c(sparams, SNDRV_PCM_HW_PARAM_FORMAT);
	if (direct)
		sformat = format;
	else
		sformat = snd_pcm_plug_slave_format(format, sformat_mask);

	if ((__force int)sformat < 0 ||
	    !snd_mask_test_format(sformat_mask, sformat)) {
		/* fall back to the first slave format that OSS can express */
		pcm_for_each_format(sformat) {
			if (snd_mask_test_format(sformat_mask, sformat) &&
			    snd_pcm_oss_format_to(sformat) >= 0)
				goto format_found;
		}
		pcm_dbg(substream->pcm, "Cannot find a format!!!\n");
		err = -EINVAL;
		goto failure;
	}
 format_found:
	err = _snd_pcm_hw_param_set(sparams, SNDRV_PCM_HW_PARAM_FORMAT, (__force int)sformat, 0);
	if (err < 0)
		goto failure;

	if (direct) {
		/* direct mode: the client sees exactly the slave parameters */
		memcpy(params, sparams, sizeof(*params));
	} else {
		/* otherwise describe the client side from the OSS settings */
		_snd_pcm_hw_params_any(params);
		_snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_ACCESS,
				      (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED, 0);
		_snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_FORMAT,
				      (__force int)snd_pcm_oss_format_from(runtime->oss.format), 0);
		_snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_CHANNELS,
				      runtime->oss.channels, 0);
		_snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_RATE,
				      runtime->oss.rate, 0);
		pdprintf("client: access = %i, format = %i, channels = %i, rate = %i\n",
			 params_access(params), params_format(params),
			 params_channels(params), params_rate(params));
	}
	pdprintf("slave: access = %i, format = %i, channels = %i, rate = %i\n",
		 params_access(sparams), params_format(sparams),
		 params_channels(sparams), params_rate(sparams));

	/* bytes per frame on the client side */
	oss_frame_size = snd_pcm_format_physical_width(params_format(params)) *
			 params_channels(params) / 8;

	/* computes oss.period_bytes and oss.periods */
	err = snd_pcm_oss_period_size(substream, params, sparams);
	if (err < 0)
		goto failure;

	/* ask the slave for the nearest matching period size and count */
	n = snd_pcm_plug_slave_size(substream, runtime->oss.period_bytes / oss_frame_size);
	err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, n, NULL);
	if (err < 0)
		goto failure;

	err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIODS,
				     runtime->oss.periods, NULL);
	if (err < 0)
		goto failure;

	/* the DROP result is deliberately not checked */
	snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);

	err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_HW_PARAMS, sparams);
	if (err < 0) {
		pcm_dbg(substream->pcm, "HW_PARAMS failed: %i\n", err);
		goto failure;
	}

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
	/* rebuild the conversion chain from scratch */
	snd_pcm_oss_plugin_clear(substream);
	if (!direct) {
		/* add necessary plugins */
		err = snd_pcm_plug_format_plugins(substream, params, sparams);
		if (err < 0) {
			pcm_dbg(substream->pcm,
				"snd_pcm_plug_format_plugins failed: %i\n", err);
			goto failure;
		}
		if (runtime->oss.plugin_first) {
			struct snd_pcm_plugin *plugin;
			err = snd_pcm_plugin_build_io(substream, sparams, &plugin);
			if (err < 0) {
				pcm_dbg(substream->pcm,
					"snd_pcm_plugin_build_io failed: %i\n", err);
				goto failure;
			}
			/* the I/O plugin goes last for playback, first for capture */
			if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
				err = snd_pcm_plugin_append(plugin);
			} else {
				err = snd_pcm_plugin_insert(plugin);
			}
			if (err < 0)
				goto failure;
		}
	}
#endif

	/* software parameters */
	if (runtime->oss.trigger) {
		sw_params->start_threshold = 1;
	} else {
		sw_params->start_threshold = runtime->boundary;
	}
	if (atomic_read(&substream->mmap_count) ||
	    substream->stream == SNDRV_PCM_STREAM_CAPTURE)
		sw_params->stop_threshold = runtime->boundary;
	else
		sw_params->stop_threshold = runtime->buffer_size;
	sw_params->tstamp_mode = SNDRV_PCM_TSTAMP_NONE;
	sw_params->period_step = 1;
	sw_params->avail_min = substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
		1 : runtime->period_size;
	if (atomic_read(&substream->mmap_count) ||
	    substream->oss.setup.nosilence) {
		sw_params->silence_threshold = 0;
		sw_params->silence_size = 0;
	} else {
		snd_pcm_uframes_t frames;
		/* period_size + 16 frames, capped at the buffer size */
		frames = runtime->period_size + 16;
		if (frames > runtime->buffer_size)
			frames = runtime->buffer_size;
		sw_params->silence_threshold = frames;
		sw_params->silence_size = frames;
	}

	err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_SW_PARAMS, sw_params);
	if (err < 0) {
		pcm_dbg(substream->pcm, "SW_PARAMS failed: %i\n", err);
		goto failure;
	}

	/* recompute the OSS-side sizes from what the slave actually accepted */
	runtime->oss.periods = params_periods(sparams);
	oss_period_size = snd_pcm_plug_client_size(substream, params_period_size(sparams));
	if (oss_period_size < 0) {
		err = -EINVAL;
		goto failure;
	}
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
	if (runtime->oss.plugin_first) {
		err = snd_pcm_plug_alloc(substream, oss_period_size);
		if (err < 0)
			goto failure;
	}
#endif
	/* frames -> bytes; array_size() is used for both multiplications */
	oss_period_size = array_size(oss_period_size, oss_frame_size);
	oss_buffer_size = array_size(oss_period_size, runtime->oss.periods);
	if (oss_buffer_size <= 0) {
		err = -EINVAL;
		goto failure;
	}

	runtime->oss.period_bytes = oss_period_size;
	runtime->oss.buffer_bytes = oss_buffer_size;

	pdprintf("oss: period bytes = %i, buffer bytes = %i\n",
		 runtime->oss.period_bytes,
		 runtime->oss.buffer_bytes);
	pdprintf("slave: period_size = %i, buffer_size = %i\n",
		 params_period_size(sparams),
		 params_buffer_size(sparams));

	/* store the effective client-side settings back into the OSS state */
	runtime->oss.format = snd_pcm_oss_format_to(params_format(params));
	runtime->oss.channels = params_channels(params);
	runtime->oss.rate = params_rate(params);

	/* replace the intermediate buffer with a zeroed one of one period */
	kvfree(runtime->oss.buffer);
	runtime->oss.buffer = kvzalloc(runtime->oss.period_bytes, GFP_KERNEL);
	if (!runtime->oss.buffer) {
		err = -ENOMEM;
		goto failure;
	}

	/* parameters are now applied; a PREPARE is still pending */
	runtime->oss.params = 0;
	runtime->oss.prepare = 1;
	runtime->oss.buffer_used = 0;
	snd_pcm_runtime_buffer_set_silence(runtime);

	runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size);

	err = 0;
failure:
	/* common exit: buffers are released only on error */
	if (err)
		snd_pcm_oss_release_buffers(substream);
	kfree(sw_params);
	kfree(params);
	kfree(sparams);
	return err;
}

/* this one takes the lock by itself */
/*
 * With @trylock set the lock is only tried and -EAGAIN is returned if it
 * is unavailable; otherwise the lock is taken interruptibly and
 * -ERESTARTSYS is returned when interrupted.
 */
static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream,
				     bool trylock)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	int err;

	if (trylock) {
		if (!(mutex_trylock(&runtime->oss.params_lock)))
			return -EAGAIN;
	} else if (mutex_lock_interruptible(&runtime->oss.params_lock))
		return -ERESTARTSYS;

	err = snd_pcm_oss_change_params_locked(substream);
	mutex_unlock(&runtime->oss.params_lock);
	return err;
}

/*
 * Apply pending parameter changes on both streams of @pcm_oss_file and
 * report the first non-NULL substream through @r_substream (if given).
 * Returns -EIO when the file has no substream at all, or the error from
 * snd_pcm_oss_change_params().
 */
static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_file, struct snd_pcm_substream **r_substream)
{
	int idx, err;
	struct snd_pcm_substream *asubstream = NULL, *substream;

	for (idx = 0; idx < 2; idx++) {
		substream = pcm_oss_file->streams[idx];
		if (substream == NULL)
			continue;
		if (asubstream == NULL)
			asubstream = substream;
		if (substream->runtime->oss.params) {
			err = snd_pcm_oss_change_params(substream, false);
			if (err < 0)
				return err;
		}
	}
	if (!asubstream)
		return -EIO;
	if (r_substream)
		*r_substream = asubstream;
	return 0;
}

/* call with params_lock held */
/* NOTE: this always calls PREPARE unconditionally no matter whether
 * runtime->oss.prepare is set or not
 */
/*
 * On success the OSS bookkeeping (prepare flag, period pointers,
 * buffer_used) is reset to zero.
 */
static int snd_pcm_oss_prepare(struct snd_pcm_substream *substream)
{
	int err;
	struct snd_pcm_runtime *runtime = substream->runtime;

	err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_PREPARE, NULL);
	if (err < 0) {
		pcm_dbg(substream->pcm,
			"snd_pcm_oss_prepare: SNDRV_PCM_IOCTL_PREPARE failed\n");
		return err;
	}
	runtime->oss.prepare = 0;
	runtime->oss.prev_hw_ptr_period = 0;
	runtime->oss.period_ptr = 0;
	runtime->oss.buffer_used = 0;

	return 0;
}

/*
 * Bring the substream into a usable state: apply pending parameter
 * changes, then run a pending PREPARE.  Takes params_lock itself.
 */
static int snd_pcm_oss_make_ready(struct snd_pcm_substream *substream)
{
	struct snd_pcm_runtime *runtime;
	int err;

	runtime = substream->runtime;
	if (runtime->oss.params) {
		err = snd_pcm_oss_change_params(substream, false);
		if (err < 0)
			return err;
	}
	if (runtime->oss.prepare) {
		if
(mutex_lock_interruptible(&runtime->oss.params_lock))
			return -ERESTARTSYS;
		/* snd_pcm_oss_prepare() must run under params_lock */
		err = snd_pcm_oss_prepare(substream);
		mutex_unlock(&runtime->oss.params_lock);
		if (err < 0)
			return err;
	}
	return 0;
}

/* call with params_lock held */
/*
 * Same as snd_pcm_oss_make_ready(), for callers that already hold
 * params_lock: applies pending parameters, then a pending PREPARE.
 */
static int snd_pcm_oss_make_ready_locked(struct snd_pcm_substream *substream)
{
	struct snd_pcm_runtime *runtime;
	int err;

	runtime = substream->runtime;
	if (runtime->oss.params) {
		err = snd_pcm_oss_change_params_locked(substream);
		if (err < 0)
			return err;
	}
	if (runtime->oss.prepare) {
		err = snd_pcm_oss_prepare(substream);
		if (err < 0)
			return err;
	}
	return 0;
}

/*
 * Query the capture delay into *@delay.  While it exceeds the buffer
 * size, forward the stream by the excess rounded up to whole periods and
 * query again.  Returns 0 or the negative error of the failing ioctl.
 */
static int snd_pcm_oss_capture_position_fixup(struct snd_pcm_substream *substream, snd_pcm_sframes_t *delay)
{
	struct snd_pcm_runtime *runtime;
	snd_pcm_uframes_t frames;
	int err = 0;

	while (1) {
		err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, delay);
		if (err < 0)
			break;
		runtime = substream->runtime;
		if (*delay <= (snd_pcm_sframes_t)runtime->buffer_size)
			break;
		/* in case of overrun, skip whole periods like OSS/Linux driver does */
		/* until avail(delay) <= buffer_size */
		frames = (*delay - runtime->buffer_size) + runtime->period_size - 1;
		frames /= runtime->period_size;
		frames *= runtime->period_size;
		err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_FORWARD, &frames);
		if (err < 0)
			break;
	}
	return err;
}

/*
 * Write @frames frames from @ptr to the substream, re-preparing the
 * stream after an XRUN or suspend and retrying on -EPIPE/-ESTRPIPE.
 *
 * The body unlocks and re-locks runtime->oss.params_lock around the
 * transfer, so it expects to be entered with that lock held.  Returns
 * the transfer result, or -EAGAIN when the stream is left in PREPARED
 * state after a failed transfer.
 */
snd_pcm_sframes_t snd_pcm_oss_write3(struct snd_pcm_substream *substream, const char *ptr, snd_pcm_uframes_t frames, int in_kernel)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	int ret;
	while (1) {
		if (runtime->state == SNDRV_PCM_STATE_XRUN ||
		    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
			pcm_dbg(substream->pcm,
				"pcm_oss: write: recovering from %s\n",
				runtime->state == SNDRV_PCM_STATE_XRUN ?
				"XRUN" : "SUSPEND");
#endif
			ret = snd_pcm_oss_prepare(substream);
			if (ret < 0)
				break;
		}
		/* drop params_lock for the duration of the transfer */
		mutex_unlock(&runtime->oss.params_lock);
		ret = __snd_pcm_lib_xfer(substream, (void *)ptr, true,
					 frames, in_kernel);
		mutex_lock(&runtime->oss.params_lock);
		if (ret != -EPIPE && ret != -ESTRPIPE)
			break;
		/* test, if we can't store new data, because the stream */
		/* has not been started */
		if (runtime->state == SNDRV_PCM_STATE_PREPARED)
			return -EAGAIN;
	}
	return ret;
}

/*
 * Read @frames frames from the substream into @ptr.
 *
 * An XRUN or suspended stream is drained, a stream in SETUP state is
 * prepared, and the capture position is fixed up before each transfer.
 * -EPIPE causes a retry (after a DROP if the stream is DRAINING), as
 * does -ESTRPIPE.  Like snd_pcm_oss_write3(), the transfer itself runs
 * with runtime->oss.params_lock temporarily released.
 */
snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *ptr, snd_pcm_uframes_t frames, int in_kernel)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	snd_pcm_sframes_t delay;
	int ret;
	while (1) {
		if (runtime->state == SNDRV_PCM_STATE_XRUN ||
		    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
			pcm_dbg(substream->pcm,
				"pcm_oss: read: recovering from %s\n",
				runtime->state == SNDRV_PCM_STATE_XRUN ?
				"XRUN" : "SUSPEND");
#endif
			ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL);
			if (ret < 0)
				break;
		} else if (runtime->state == SNDRV_PCM_STATE_SETUP) {
			ret = snd_pcm_oss_prepare(substream);
			if (ret < 0)
				break;
		}
		ret = snd_pcm_oss_capture_position_fixup(substream, &delay);
		if (ret < 0)
			break;
		/* drop params_lock for the duration of the transfer */
		mutex_unlock(&runtime->oss.params_lock);
		ret = __snd_pcm_lib_xfer(substream, (void *)ptr, true,
					 frames, in_kernel);
		mutex_lock(&runtime->oss.params_lock);
		if (ret == -EPIPE) {
			if (runtime->state == SNDRV_PCM_STATE_DRAINING) {
				ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);
				if (ret < 0)
					break;
			}
			continue;
		}
		if (ret != -ESTRPIPE)
			break;
	}
	return ret;
}

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
/*
 * Vector variant of snd_pcm_oss_write3(): writes @frames frames from the
 * per-channel buffers @bufs via snd_pcm_kernel_writev().  Unlike
 * snd_pcm_oss_write3() it does not touch params_lock.
 */
snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	int ret;
	while (1) {
		if (runtime->state == SNDRV_PCM_STATE_XRUN ||
		    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
			pcm_dbg(substream->pcm,
				"pcm_oss: writev: recovering from %s\n",
				runtime->state ==
SNDRV_PCM_STATE_XRUN ?
				"XRUN" : "SUSPEND");
#endif
			ret = snd_pcm_oss_prepare(substream);
			if (ret < 0)
				break;
		}
		ret = snd_pcm_kernel_writev(substream, bufs, frames);
		/* only -EPIPE and -ESTRPIPE are retried */
		if (ret != -EPIPE && ret != -ESTRPIPE)
			break;

		/* test, if we can't store new data, because the stream */
		/* has not been started */
		if (runtime->state == SNDRV_PCM_STATE_PREPARED)
			return -EAGAIN;
	}
	return ret;
}

/*
 * Vector variant of the read path: reads @frames frames into the
 * per-channel buffers @bufs via snd_pcm_kernel_readv().  An XRUN or
 * suspended stream is drained and a stream in SETUP state is prepared
 * before each attempt; -EPIPE and -ESTRPIPE cause a retry.
 */
snd_pcm_sframes_t snd_pcm_oss_readv3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	int ret;
	while (1) {
		if (runtime->state == SNDRV_PCM_STATE_XRUN ||
		    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
			pcm_dbg(substream->pcm,
				"pcm_oss: readv: recovering from %s\n",
				runtime->state == SNDRV_PCM_STATE_XRUN ?
				"XRUN" : "SUSPEND");
#endif
			ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL);
			if (ret < 0)
				break;
		} else if (runtime->state == SNDRV_PCM_STATE_SETUP) {
			ret = snd_pcm_oss_prepare(substream);
			if (ret < 0)
				break;
		}
		ret = snd_pcm_kernel_readv(substream, bufs, frames);
		if (ret != -EPIPE && ret != -ESTRPIPE)
			break;
	}
	return ret;
}
#endif /* CONFIG_SND_PCM_OSS_PLUGINS */

/*
 * Write @bytes bytes from @buf, going through the plugin chain when one
 * is installed and straight to snd_pcm_oss_write3() otherwise.
 *
 * @in_kernel == 0 means @buf is really a user-space pointer; on the
 * plugin path the data is then first copied into runtime->oss.buffer.
 * Returns the number of bytes consumed, or a value <= 0 passed through
 * from the lower layer (-EFAULT if the user copy fails).
 */
static ssize_t snd_pcm_oss_write2(struct snd_pcm_substream *substream, const char *buf, size_t bytes, int in_kernel)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	snd_pcm_sframes_t frames, frames1;
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
	if (runtime->oss.plugin_first) {
		struct snd_pcm_plugin_channel *channels;
		/* bytes per frame at the input of the first plugin */
		size_t oss_frame_bytes = (runtime->oss.plugin_first->src_width * runtime->oss.plugin_first->src_format.channels) / 8;
		if (!in_kernel) {
			if (copy_from_user(runtime->oss.buffer, (const char __force __user *)buf, bytes))
				return -EFAULT;
			buf = runtime->oss.buffer;
		}
		frames = bytes / oss_frame_bytes;
		frames1 = snd_pcm_plug_client_channels_buf(substream, (char *)buf, frames, &channels);
		if (frames1 < 0)
			return frames1;
		frames1 = snd_pcm_plug_write_transfer(substream, channels, frames1);
		if (frames1 <= 0)
			return
frames1; bytes = frames1 * oss_frame_bytes; } else #endif { frames = bytes_to_frames(runtime, bytes); frames1 = snd_pcm_oss_write3(substream, buf, frames, in_kernel); if (frames1 <= 0) return frames1; bytes = frames_to_bytes(runtime, frames1); } return bytes; } static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const char __user *buf, size_t bytes) { size_t xfer = 0; ssize_t tmp = 0; struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return -ENXIO; atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } tmp = snd_pcm_oss_make_ready_locked(substream); if (tmp < 0) goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { tmp = bytes; if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes) tmp = runtime->oss.period_bytes - runtime->oss.buffer_used; if (tmp > 0) { if (copy_from_user(runtime->oss.buffer + runtime->oss.buffer_used, buf, tmp)) { tmp = -EFAULT; goto err; } } runtime->oss.buffer_used += tmp; buf += tmp; bytes -= tmp; xfer += tmp; if (substream->oss.setup.partialfrag || runtime->oss.buffer_used == runtime->oss.period_bytes) { tmp = snd_pcm_oss_write2(substream, runtime->oss.buffer + runtime->oss.period_ptr, runtime->oss.buffer_used - runtime->oss.period_ptr, 1); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; runtime->oss.period_ptr += tmp; runtime->oss.period_ptr %= runtime->oss.period_bytes; if (runtime->oss.period_ptr == 0 || runtime->oss.period_ptr == runtime->oss.buffer_used) runtime->oss.buffer_used = 0; else if ((substream->f_flags & O_NONBLOCK) != 0) { tmp = -EAGAIN; goto err; } } } else { tmp = snd_pcm_oss_write2(substream, (const char __force *)buf, runtime->oss.period_bytes, 0); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; buf += tmp; bytes -= tmp; xfer += tmp; if ((substream->f_flags & O_NONBLOCK) != 0 && tmp != runtime->oss.period_bytes) tmp = -EAGAIN; } 
err: mutex_unlock(&runtime->oss.params_lock); if (tmp < 0) break; if (signal_pending(current)) { tmp = -ERESTARTSYS; break; } tmp = 0; } atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } static ssize_t snd_pcm_oss_read2(struct snd_pcm_substream *substream, char *buf, size_t bytes, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t frames, frames1; #ifdef CONFIG_SND_PCM_OSS_PLUGINS char __user *final_dst = (char __force __user *)buf; if (runtime->oss.plugin_first) { struct snd_pcm_plugin_channel *channels; size_t oss_frame_bytes = (runtime->oss.plugin_last->dst_width * runtime->oss.plugin_last->dst_format.channels) / 8; if (!in_kernel) buf = runtime->oss.buffer; frames = bytes / oss_frame_bytes; frames1 = snd_pcm_plug_client_channels_buf(substream, buf, frames, &channels); if (frames1 < 0) return frames1; frames1 = snd_pcm_plug_read_transfer(substream, channels, frames1); if (frames1 <= 0) return frames1; bytes = frames1 * oss_frame_bytes; if (!in_kernel && copy_to_user(final_dst, buf, bytes)) return -EFAULT; } else #endif { frames = bytes_to_frames(runtime, bytes); frames1 = snd_pcm_oss_read3(substream, buf, frames, in_kernel); if (frames1 <= 0) return frames1; bytes = frames_to_bytes(runtime, frames1); } return bytes; } static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __user *buf, size_t bytes) { size_t xfer = 0; ssize_t tmp = 0; struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return -ENXIO; atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } tmp = snd_pcm_oss_make_ready_locked(substream); if (tmp < 0) goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { if (runtime->oss.buffer_used == 0) { tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1); if (tmp <= 0) goto err; 
runtime->oss.bytes += tmp; runtime->oss.period_ptr = tmp; runtime->oss.buffer_used = tmp; } tmp = bytes; if ((size_t) tmp > runtime->oss.buffer_used) tmp = runtime->oss.buffer_used; if (copy_to_user(buf, runtime->oss.buffer + (runtime->oss.period_ptr - runtime->oss.buffer_used), tmp)) { tmp = -EFAULT; goto err; } buf += tmp; bytes -= tmp; xfer += tmp; runtime->oss.buffer_used -= tmp; } else { tmp = snd_pcm_oss_read2(substream, (char __force *)buf, runtime->oss.period_bytes, 0); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; buf += tmp; bytes -= tmp; xfer += tmp; } err: mutex_unlock(&runtime->oss.params_lock); if (tmp < 0) break; if (signal_pending(current)) { tmp = -ERESTARTSYS; break; } tmp = 0; } atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } static int snd_pcm_oss_reset(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; int i; for (i = 0; i < 2; i++) { substream = pcm_oss_file->streams[i]; if (!substream) continue; runtime = substream->runtime; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; runtime->oss.prev_hw_ptr_period = 0; runtime->oss.period_ptr = 0; mutex_unlock(&runtime->oss.params_lock); } return 0; } static int snd_pcm_oss_post(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream != NULL) { err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_START, NULL); } /* note: all errors from the start action are ignored */ /* OSS apps do not know, how to handle them */ return 0; } static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size) { struct snd_pcm_runtime *runtime; ssize_t result = 0; snd_pcm_state_t state; long res; wait_queue_entry_t wait; runtime = 
substream->runtime; init_waitqueue_entry(&wait, current); add_wait_queue(&runtime->sleep, &wait); #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync1: size = %li\n", size); #endif while (1) { result = snd_pcm_oss_write2(substream, runtime->oss.buffer, size, 1); if (result > 0) { runtime->oss.buffer_used = 0; result = 0; break; } if (result != 0 && result != -EAGAIN) break; result = 0; set_current_state(TASK_INTERRUPTIBLE); scoped_guard(pcm_stream_lock_irq, substream) state = runtime->state; if (state != SNDRV_PCM_STATE_RUNNING) { set_current_state(TASK_RUNNING); break; } res = schedule_timeout(10 * HZ); if (signal_pending(current)) { result = -ERESTARTSYS; break; } if (res == 0) { pcm_err(substream->pcm, "OSS sync error - DMA timeout\n"); result = -EIO; break; } } remove_wait_queue(&runtime->sleep, &wait); return result; } static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) { int err = 0; unsigned int saved_f_flags; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_format_t format; unsigned long width; size_t size; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream != NULL) { runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) goto __direct; atomic_inc(&runtime->oss.rw_ref); if (mutex_lock_interruptible(&runtime->oss.params_lock)) { atomic_dec(&runtime->oss.rw_ref); return -ERESTARTSYS; } err = snd_pcm_oss_make_ready_locked(substream); if (err < 0) goto unlock; format = snd_pcm_oss_format_from(runtime->oss.format); width = snd_pcm_format_physical_width(format); if (runtime->oss.buffer_used > 0) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync: buffer_used\n"); #endif size = (8 * (runtime->oss.period_bytes - runtime->oss.buffer_used) + 7) / width; snd_pcm_format_set_silence(format, runtime->oss.buffer + runtime->oss.buffer_used, size); err = snd_pcm_oss_sync1(substream, runtime->oss.period_bytes); if (err < 0) goto unlock; } else if (runtime->oss.period_ptr > 0) { #ifdef OSS_DEBUG 
pcm_dbg(substream->pcm, "sync: period_ptr\n"); #endif size = runtime->oss.period_bytes - runtime->oss.period_ptr; snd_pcm_format_set_silence(format, runtime->oss.buffer, size * 8 / width); err = snd_pcm_oss_sync1(substream, size); if (err < 0) goto unlock; } /* * The ALSA's period might be a bit large than OSS one. * Fill the remain portion of ALSA period with zeros. */ size = runtime->control->appl_ptr % runtime->period_size; if (size > 0) { size = runtime->period_size - size; if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED) snd_pcm_lib_write(substream, NULL, size); else if (runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) snd_pcm_lib_writev(substream, NULL, size); } unlock: mutex_unlock(&runtime->oss.params_lock); atomic_dec(&runtime->oss.rw_ref); if (err < 0) return err; /* * finish sync: drain the buffer */ __direct: saved_f_flags = substream->f_flags; substream->f_flags &= ~O_NONBLOCK; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); substream->f_flags = saved_f_flags; if (err < 0) return err; mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; mutex_unlock(&runtime->oss.params_lock); } substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (substream != NULL) { err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); if (err < 0) return err; mutex_lock(&runtime->oss.params_lock); runtime->oss.buffer_used = 0; runtime->oss.prepare = 1; mutex_unlock(&runtime->oss.params_lock); } return 0; } static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) { int idx; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; int err; if (substream == NULL) continue; runtime = substream->runtime; if (rate < 1000) rate = 1000; else if (rate > 192000) rate = 192000; err = lock_params(runtime); if (err < 0) return err; if 
(runtime->oss.rate != rate) { runtime->oss.params = 1; runtime->oss.rate = rate; } unlock_params(runtime); } return snd_pcm_oss_get_rate(pcm_oss_file); } static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.rate; } static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsigned int channels) { int idx; if (channels < 1) channels = 1; if (channels > 128) return -EINVAL; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; int err; if (substream == NULL) continue; runtime = substream->runtime; err = lock_params(runtime); if (err < 0) return err; if (runtime->oss.channels != channels) { runtime->oss.params = 1; runtime->oss.channels = channels; } unlock_params(runtime); } return snd_pcm_oss_get_channels(pcm_oss_file); } static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.channels; } static int snd_pcm_oss_get_block_size(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.period_bytes; } static int snd_pcm_oss_get_formats(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; int direct; struct snd_pcm_hw_params *params __free(kfree) = NULL; unsigned int formats = 0; const struct snd_mask *format_mask; int fmt; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; if (atomic_read(&substream->mmap_count)) direct = 1; else direct = substream->oss.setup.direct; if (!direct) 
return AFMT_MU_LAW | AFMT_U8 |
		       AFMT_S16_LE | AFMT_S16_BE |
		       AFMT_S8 | AFMT_U16_LE |
		       AFMT_U16_BE |
		       AFMT_S32_LE | AFMT_S32_BE |
		       AFMT_S24_LE | AFMT_S24_BE |
		       AFMT_S24_PACKED;
	/* direct mode: ask the hardware which formats it can actually do */
	params = kmalloc(sizeof(*params), GFP_KERNEL);
	if (!params)
		return -ENOMEM;
	_snd_pcm_hw_params_any(params);
	err = snd_pcm_hw_refine(substream, params);
	if (err < 0)
		return err;
	format_mask = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT);
	/* translate each of the first 32 format bits set in the mask to AFMT_* */
	for (fmt = 0; fmt < 32; ++fmt) {
		if (snd_mask_test(format_mask, fmt)) {
			int f = snd_pcm_oss_format_to((__force snd_pcm_format_t)fmt);
			/* negative values from the conversion are skipped */
			if (f >= 0)
				formats |= f;
		}
	}

	return formats;
}

/*
 * Select an OSS sample format for every open substream of the file.
 *
 * AFMT_QUERY leaves the stored format untouched.  Any other value is
 * checked against snd_pcm_oss_get_formats(); a format not contained in
 * that mask is replaced by AFMT_U8.  Returns the format reported by
 * snd_pcm_oss_get_format(), or a negative error code.
 */
static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int format)
{
	int formats, idx;
	int err;

	if (format != AFMT_QUERY) {
		formats = snd_pcm_oss_get_formats(pcm_oss_file);
		if (formats < 0)
			return formats;
		/* fall back to AFMT_U8 when the requested format is not offered */
		if (!(formats & format))
			format = AFMT_U8;
		for (idx = 1; idx >= 0; --idx) {
			struct snd_pcm_substream *substream = pcm_oss_file->streams[idx];
			struct snd_pcm_runtime *runtime;
			if (substream == NULL)
				continue;
			runtime = substream->runtime;
			err = lock_params(runtime);
			if (err < 0)
				return err;
			if (runtime->oss.format != format) {
				/* flag the change so the parameters are recomputed later */
				runtime->oss.params = 1;
				runtime->oss.format = format;
			}
			unlock_params(runtime);
		}
	}
	return snd_pcm_oss_get_format(pcm_oss_file);
}

/* Return the OSS format stored on the active substream, or a negative error. */
static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file)
{
	struct snd_pcm_substream *substream;
	int err;

	err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream);
	if (err < 0)
		return err;
	return substream->runtime->oss.format;
}

/*
 * Query or set the subdivision of one substream.
 *
 * subdivide == 0 is a query: returns the stored subdivision, or 1 when
 * none is stored.  Otherwise the value must be 1, 2, 4, 8 or 16 and
 * neither a subdivision nor a fragment shift may already be set, else
 * -EINVAL is returned.
 */
static int snd_pcm_oss_set_subdivide1(struct snd_pcm_substream *substream, int subdivide)
{
	struct snd_pcm_runtime *runtime;

	runtime = substream->runtime;
	if (subdivide == 0) {
		subdivide = runtime->oss.subdivision;
		if (subdivide == 0)
			subdivide = 1;
		return subdivide;
	}
	/* refuse once a subdivision or fragment size has been chosen */
	if (runtime->oss.subdivision || runtime->oss.fragshift)
		return -EINVAL;
	if (subdivide != 1 && subdivide != 2 && subdivide != 4 &&
	    subdivide != 8 && subdivide != 16)
		return -EINVAL;
runtime->oss.subdivision = subdivide;
	/* flag the change so the parameters are recomputed later */
	runtime->oss.params = 1;
	return subdivide;
}

/*
 * Apply snd_pcm_oss_set_subdivide1() to every open substream of the
 * file, taking the params lock around each call.
 *
 * Returns the result for the last substream processed, the first
 * negative error encountered, or -EINVAL when no substream is open.
 */
static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int subdivide)
{
	int err = -EINVAL, idx;

	for (idx = 1; idx >= 0; --idx) {
		struct snd_pcm_substream *substream = pcm_oss_file->streams[idx];
		struct snd_pcm_runtime *runtime;

		if (substream == NULL)
			continue;
		runtime = substream->runtime;
		err = lock_params(runtime);
		if (err < 0)
			return err;
		err = snd_pcm_oss_set_subdivide1(substream, subdivide);
		unlock_params(runtime);
		if (err < 0)
			return err;
	}
	return err;
}

/*
 * Set the fragment parameters of one substream from a packed value:
 * the low 16 bits are the fragment shift, the high 16 bits the maximum
 * number of fragments.
 *
 * Fails with -EINVAL when a subdivision or fragment shift is already
 * set, or when the shift is 25 or larger.  A shift below 4 is raised to
 * 4 and a fragment count below 2 is raised to 2.  Returns 0 on success.
 */
static int snd_pcm_oss_set_fragment1(struct snd_pcm_substream *substream, unsigned int val)
{
	struct snd_pcm_runtime *runtime;
	int fragshift;

	runtime = substream->runtime;
	if (runtime->oss.subdivision || runtime->oss.fragshift)
		return -EINVAL;
	fragshift = val & 0xffff;
	if (fragshift >= 25) /* should be large enough */
		return -EINVAL;
	runtime->oss.fragshift = fragshift;
	runtime->oss.maxfrags = (val >> 16) & 0xffff;
	if (runtime->oss.fragshift < 4)		/* < 16 */
		runtime->oss.fragshift = 4;
	if (runtime->oss.maxfrags < 2)
		runtime->oss.maxfrags = 2;
	/* flag the change so the parameters are recomputed later */
	runtime->oss.params = 1;
	return 0;
}

/*
 * Apply snd_pcm_oss_set_fragment1() to every open substream of the
 * file, taking the params lock around each call.
 *
 * Returns 0 on success, the first negative error encountered, or
 * -EINVAL when no substream is open.
 */
static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsigned int val)
{
	int err = -EINVAL, idx;

	for (idx = 1; idx >= 0; --idx) {
		struct snd_pcm_substream *substream = pcm_oss_file->streams[idx];
		struct snd_pcm_runtime *runtime;

		if (substream == NULL)
			continue;
		runtime = substream->runtime;
		err = lock_params(runtime);
		if (err < 0)
			return err;
		err = snd_pcm_oss_set_fragment1(substream, val);
		unlock_params(runtime);
		if (err < 0)
			return err;
	}
	return err;
}

/* Set O_NONBLOCK on the file, under file->f_lock.  Always returns 0. */
static int snd_pcm_oss_nonblock(struct file * file)
{
	spin_lock(&file->f_lock);
	file->f_flags |= O_NONBLOCK;
	spin_unlock(&file->f_lock);
	return 0;
}

/*
 * Adjust the capability mask `res` for one direction of the file.
 * A missing substream clears DSP_CAP_DUPLEX.  Returns the adjusted mask.
 */
static int snd_pcm_oss_get_caps1(struct snd_pcm_substream *substream, int res)
{

	if (substream == NULL) {
		res &= ~DSP_CAP_DUPLEX;
		return res;
	}
#ifdef DSP_CAP_MULTI
	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
if (substream->pstr->substream_count > 1)
			res |= DSP_CAP_MULTI;
#endif
	/* DSP_CAP_REALTIME is set all times: */
	/* all ALSA drivers can return actual pointer in ring buffer */
	/* the block below is compiled out by the "&& 0" in its condition */
#if defined(DSP_CAP_REALTIME) && 0
	{
		struct snd_pcm_runtime *runtime = substream->runtime;
		if (runtime->info & (SNDRV_PCM_INFO_BLOCK_TRANSFER|SNDRV_PCM_INFO_BATCH))
			res &= ~DSP_CAP_REALTIME;
	}
#endif
	return res;
}

/*
 * Build the DSP capability mask for the file.
 *
 * Starts from TRIGGER | MMAP | DUPLEX | REALTIME, lets
 * snd_pcm_oss_get_caps1() adjust it for each direction, then ORs in
 * the revision value 0x0001.
 */
static int snd_pcm_oss_get_caps(struct snd_pcm_oss_file *pcm_oss_file)
{
	int result, idx;

	result = DSP_CAP_TRIGGER | DSP_CAP_MMAP	| DSP_CAP_DUPLEX | DSP_CAP_REALTIME;
	for (idx = 0; idx < 2; idx++) {
		struct snd_pcm_substream *substream = pcm_oss_file->streams[idx];
		result = snd_pcm_oss_get_caps1(substream, result);
	}
	result |= 0x0001;	/* revision - same as SB AWE 64 */
	return result;
}

/*
 * Set the application pointer one full buffer ahead of hw_ptr,
 * wrapped at runtime->boundary.
 */
static void snd_pcm_oss_simulate_fill(struct snd_pcm_substream *substream,
				      snd_pcm_uframes_t hw_ptr)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	snd_pcm_uframes_t appl_ptr;
	appl_ptr = hw_ptr + runtime->buffer_size;
	appl_ptr %= runtime->boundary;
	runtime->control->appl_ptr = appl_ptr;
}

/*
 * Enable or disable playback and capture according to the
 * PCM_ENABLE_OUTPUT / PCM_ENABLE_INPUT bits of `trigger`.
 *
 * Both open substreams are made ready first.  For each direction the
 * oss.trigger flag and start_threshold are updated under the params
 * lock, and the resulting START or DROP ioctl is issued after the lock
 * has been released.  Returns 0 or a negative error code.
 */
static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int trigger)
{
	struct snd_pcm_runtime *runtime;
	struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL;
	int err, cmd;

#ifdef OSS_DEBUG
	pr_debug("pcm_oss: trigger = 0x%x\n", trigger);
#endif

	psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
	csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];

	if (psubstream) {
		err = snd_pcm_oss_make_ready(psubstream);
		if (err < 0)
			return err;
	}
	if (csubstream) {
		err = snd_pcm_oss_make_ready(csubstream);
		if (err < 0)
			return err;
	}
	if (psubstream) {
		runtime = psubstream->runtime;
		/* cmd stays 0 when the trigger state does not change */
		cmd = 0;
		if (mutex_lock_interruptible(&runtime->oss.params_lock))
			return -ERESTARTSYS;
		if (trigger & PCM_ENABLE_OUTPUT) {
			if (runtime->oss.trigger)
				goto _skip1;
			/* mmapped playback: set appl_ptr a full buffer ahead before starting */
			if (atomic_read(&psubstream->mmap_count))
				snd_pcm_oss_simulate_fill(psubstream,
						get_hw_ptr_period(runtime));
			runtime->oss.trigger = 1;
runtime->start_threshold = 1; cmd = SNDRV_PCM_IOCTL_START; } else { if (!runtime->oss.trigger) goto _skip1; runtime->oss.trigger = 0; runtime->start_threshold = runtime->boundary; cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } _skip1: mutex_unlock(&runtime->oss.params_lock); if (cmd) { err = snd_pcm_kernel_ioctl(psubstream, cmd, NULL); if (err < 0) return err; } } if (csubstream) { runtime = csubstream->runtime; cmd = 0; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; if (trigger & PCM_ENABLE_INPUT) { if (runtime->oss.trigger) goto _skip2; runtime->oss.trigger = 1; runtime->start_threshold = 1; cmd = SNDRV_PCM_IOCTL_START; } else { if (!runtime->oss.trigger) goto _skip2; runtime->oss.trigger = 0; runtime->start_threshold = runtime->boundary; cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } _skip2: mutex_unlock(&runtime->oss.params_lock); if (cmd) { err = snd_pcm_kernel_ioctl(csubstream, cmd, NULL); if (err < 0) return err; } } return 0; } static int snd_pcm_oss_get_trigger(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL; int result = 0; psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (psubstream && psubstream->runtime && psubstream->runtime->oss.trigger) result |= PCM_ENABLE_OUTPUT; if (csubstream && csubstream->runtime && csubstream->runtime->oss.trigger) result |= PCM_ENABLE_INPUT; return result; } static int snd_pcm_oss_get_odelay(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t delay; int err; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -EINVAL; err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; if (runtime->oss.params || runtime->oss.prepare) return 0; err = snd_pcm_kernel_ioctl(substream, 
SNDRV_PCM_IOCTL_DELAY, &delay); if (err == -EPIPE) delay = 0; /* hack for broken OSS applications */ else if (err < 0) return err; return snd_pcm_oss_bytes(substream, delay); } static int snd_pcm_oss_get_ptr(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct count_info __user * _info) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t delay; int fixup; struct count_info info; int err; if (_info == NULL) return -EFAULT; substream = pcm_oss_file->streams[stream]; if (substream == NULL) return -EINVAL; err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; if (runtime->oss.params || runtime->oss.prepare) { memset(&info, 0, sizeof(info)); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } if (stream == SNDRV_PCM_STREAM_PLAYBACK) { err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &delay); if (err == -EPIPE || err == -ESTRPIPE || (! err && delay < 0)) { err = 0; delay = 0; fixup = 0; } else { fixup = runtime->oss.buffer_used; } } else { err = snd_pcm_oss_capture_position_fixup(substream, &delay); fixup = -runtime->oss.buffer_used; } if (err < 0) return err; info.ptr = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr % runtime->buffer_size); if (atomic_read(&substream->mmap_count)) { snd_pcm_sframes_t n; delay = get_hw_ptr_period(runtime); n = delay - runtime->oss.prev_hw_ptr_period; if (n < 0) n += runtime->boundary; info.blocks = n / runtime->period_size; runtime->oss.prev_hw_ptr_period = delay; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) snd_pcm_oss_simulate_fill(substream, delay); info.bytes = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr) & INT_MAX; } else { delay = snd_pcm_oss_bytes(substream, delay); if (stream == SNDRV_PCM_STREAM_PLAYBACK) { if (substream->oss.setup.buggyptr) info.blocks = (runtime->oss.buffer_bytes - delay - fixup) / runtime->oss.period_bytes; else info.blocks = (delay + fixup) / runtime->oss.period_bytes; 
info.bytes = (runtime->oss.bytes - delay) & INT_MAX; } else { delay += fixup; info.blocks = delay / runtime->oss.period_bytes; info.bytes = (runtime->oss.bytes + delay) & INT_MAX; } } if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_pcm_oss_get_space(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct audio_buf_info __user *_info) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t avail; int fixup; struct audio_buf_info info; int err; if (_info == NULL) return -EFAULT; substream = pcm_oss_file->streams[stream]; if (substream == NULL) return -EINVAL; runtime = substream->runtime; if (runtime->oss.params) { err = snd_pcm_oss_change_params(substream, false); if (err < 0) return err; } info.fragsize = runtime->oss.period_bytes; info.fragstotal = runtime->periods; if (runtime->oss.prepare) { if (stream == SNDRV_PCM_STREAM_PLAYBACK) { info.bytes = runtime->oss.period_bytes * runtime->oss.periods; info.fragments = runtime->oss.periods; } else { info.bytes = 0; info.fragments = 0; } } else { if (stream == SNDRV_PCM_STREAM_PLAYBACK) { err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &avail); if (err == -EPIPE || err == -ESTRPIPE || (! 
err && avail < 0)) { avail = runtime->buffer_size; err = 0; fixup = 0; } else { avail = runtime->buffer_size - avail; fixup = -runtime->oss.buffer_used; } } else { err = snd_pcm_oss_capture_position_fixup(substream, &avail); fixup = runtime->oss.buffer_used; } if (err < 0) return err; info.bytes = snd_pcm_oss_bytes(substream, avail) + fixup; info.fragments = info.bytes / runtime->oss.period_bytes; } #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: space: bytes = %i, fragments = %i, fragstotal = %i, fragsize = %i\n", info.bytes, info.fragments, info.fragstotal, info.fragsize); #endif if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_pcm_oss_get_mapbuf(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct buffmem_desc __user * _info) { // it won't be probably implemented // pr_debug("TODO: snd_pcm_oss_get_mapbuf\n"); return -EINVAL; } static const char *strip_task_path(const char *path) { const char *ptr, *ptrl = NULL; for (ptr = path; *ptr; ptr++) { if (*ptr == '/') ptrl = ptr + 1; } return ptrl; } static void snd_pcm_oss_look_for_setup(struct snd_pcm *pcm, int stream, const char *task_name, struct snd_pcm_oss_setup *rsetup) { struct snd_pcm_oss_setup *setup; guard(mutex)(&pcm->streams[stream].oss.setup_mutex); do { for (setup = pcm->streams[stream].oss.setup_list; setup; setup = setup->next) { if (!strcmp(setup->task_name, task_name)) goto out; } } while ((task_name = strip_task_path(task_name)) != NULL); out: if (setup) *rsetup = *setup; } static void snd_pcm_oss_release_substream(struct snd_pcm_substream *substream) { snd_pcm_oss_release_buffers(substream); substream->oss.oss = 0; } static void snd_pcm_oss_init_substream(struct snd_pcm_substream *substream, struct snd_pcm_oss_setup *setup, int minor) { struct snd_pcm_runtime *runtime; substream->oss.oss = 1; substream->oss.setup = *setup; if (setup->nonblock) substream->f_flags |= O_NONBLOCK; else if (setup->block) substream->f_flags &= ~O_NONBLOCK; runtime = 
substream->runtime;
	/* defaults: parameters pending, trigger enabled, 8000 Hz */
	runtime->oss.params = 1;
	runtime->oss.trigger = 1;
	runtime->oss.rate = 8000;
	mutex_init(&runtime->oss.params_lock);
	/* the default sample format depends on which OSS device minor was opened */
	switch (SNDRV_MINOR_OSS_DEVICE(minor)) {
	case SNDRV_MINOR_OSS_PCM_8:
		runtime->oss.format = AFMT_U8;
		break;
	case SNDRV_MINOR_OSS_PCM_16:
		runtime->oss.format = AFMT_S16_LE;
		break;
	default:
		runtime->oss.format = AFMT_MU_LAW;
	}
	runtime->oss.channels = 1;
	runtime->oss.fragshift = 0;
	runtime->oss.maxfrags = 0;
	runtime->oss.subdivision = 0;
	substream->pcm_release = snd_pcm_oss_release_substream;
	atomic_set(&runtime->oss.rw_ref, 0);
}

/*
 * Release every substream attached to the OSS file and free the file
 * structure itself.  A NULL pcm_oss_file is accepted.  Always returns 0.
 */
static int snd_pcm_oss_release_file(struct snd_pcm_oss_file *pcm_oss_file)
{
	int cidx;
	if (!pcm_oss_file)
		return 0;
	for (cidx = 0; cidx < 2; ++cidx) {
		struct snd_pcm_substream *substream = pcm_oss_file->streams[cidx];
		if (substream)
			snd_pcm_release_substream(substream);
	}
	kfree(pcm_oss_file);
	return 0;
}

/*
 * Allocate an OSS file structure and open the substreams matching the
 * file's access mode.
 *
 * A direction is skipped when its setup entry is disabled, when the PCM
 * has no substream for it, or when the file mode does not include it.
 * On success file->private_data is set and, if rpcm_oss_file is
 * non-NULL, the new structure is stored there as well.
 * Returns 0 on success, -ENOMEM on allocation failure, or the error
 * from snd_pcm_open_substream().
 */
static int snd_pcm_oss_open_file(struct file *file,
				 struct snd_pcm *pcm,
				 struct snd_pcm_oss_file **rpcm_oss_file,
				 int minor,
				 struct snd_pcm_oss_setup *setup)
{
	int idx, err;
	struct snd_pcm_oss_file *pcm_oss_file;
	struct snd_pcm_substream *substream;
	fmode_t f_mode = file->f_mode;

	if (rpcm_oss_file)
		*rpcm_oss_file = NULL;

	pcm_oss_file = kzalloc(sizeof(*pcm_oss_file), GFP_KERNEL);
	if (pcm_oss_file == NULL)
		return -ENOMEM;

	/* a read/write open of a half-duplex PCM is treated as write-only */
	if ((f_mode & (FMODE_WRITE|FMODE_READ)) == (FMODE_WRITE|FMODE_READ) &&
	    (pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX))
		f_mode = FMODE_WRITE;

	file->f_flags &= ~O_APPEND;
	for (idx = 0; idx < 2; idx++) {
		if (setup[idx].disable)
			continue;
		if (! pcm->streams[idx].substream_count)
			continue; /* no matching substream */
		if (idx == SNDRV_PCM_STREAM_PLAYBACK) {
			if (! (f_mode & FMODE_WRITE))
				continue;
		} else {
			if (!
(f_mode & FMODE_READ)) continue; } err = snd_pcm_open_substream(pcm, idx, file, &substream); if (err < 0) { snd_pcm_oss_release_file(pcm_oss_file); return err; } pcm_oss_file->streams[idx] = substream; snd_pcm_oss_init_substream(substream, &setup[idx], minor); } if (!pcm_oss_file->streams[0] && !pcm_oss_file->streams[1]) { snd_pcm_oss_release_file(pcm_oss_file); return -EINVAL; } file->private_data = pcm_oss_file; if (rpcm_oss_file) *rpcm_oss_file = pcm_oss_file; return 0; } static int snd_task_name(struct task_struct *task, char *name, size_t size) { unsigned int idx; if (snd_BUG_ON(!task || !name || size < 2)) return -EINVAL; for (idx = 0; idx < sizeof(task->comm) && idx + 1 < size; idx++) name[idx] = task->comm[idx]; name[idx] = '\0'; return 0; } static int snd_pcm_oss_open(struct inode *inode, struct file *file) { int err; char task_name[32]; struct snd_pcm *pcm; struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_oss_setup setup[2]; int nonblock; wait_queue_entry_t wait; err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_oss_minor_data(iminor(inode), SNDRV_OSS_DEVICE_TYPE_PCM); if (pcm == NULL) { err = -ENODEV; goto __error1; } err = snd_card_file_add(pcm->card, file); if (err < 0) goto __error1; if (!try_module_get(pcm->card->module)) { err = -EFAULT; goto __error2; } if (snd_task_name(current, task_name, sizeof(task_name)) < 0) { err = -EFAULT; goto __error; } memset(setup, 0, sizeof(setup)); if (file->f_mode & FMODE_WRITE) snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_PLAYBACK, task_name, &setup[0]); if (file->f_mode & FMODE_READ) snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_CAPTURE, task_name, &setup[1]); nonblock = !!(file->f_flags & O_NONBLOCK); if (!nonblock) nonblock = nonblock_open; init_waitqueue_entry(&wait, current); add_wait_queue(&pcm->open_wait, &wait); mutex_lock(&pcm->open_mutex); while (1) { err = snd_pcm_oss_open_file(file, pcm, &pcm_oss_file, iminor(inode), setup); if (err >= 0) break; if (err == 
-EAGAIN) { if (nonblock) { err = -EBUSY; break; } } else break; set_current_state(TASK_INTERRUPTIBLE); mutex_unlock(&pcm->open_mutex); schedule(); mutex_lock(&pcm->open_mutex); if (pcm->card->shutdown) { err = -ENODEV; break; } if (signal_pending(current)) { err = -ERESTARTSYS; break; } } remove_wait_queue(&pcm->open_wait, &wait); mutex_unlock(&pcm->open_mutex); if (err < 0) goto __error; snd_card_unref(pcm->card); return err; __error: module_put(pcm->card->module); __error2: snd_card_file_remove(pcm->card, file); __error1: if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_oss_release(struct inode *inode, struct file *file) { struct snd_pcm *pcm; struct snd_pcm_substream *substream; struct snd_pcm_oss_file *pcm_oss_file; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (snd_BUG_ON(!substream)) return -ENXIO; pcm = substream->pcm; if (!pcm->card->shutdown) snd_pcm_oss_sync(pcm_oss_file); mutex_lock(&pcm->open_mutex); snd_pcm_oss_release_file(pcm_oss_file); mutex_unlock(&pcm->open_mutex); wake_up(&pcm->open_wait); module_put(pcm->card->module); snd_card_file_remove(pcm->card, file); return 0; } static long snd_pcm_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_pcm_oss_file *pcm_oss_file; int __user *p = (int __user *)arg; int res; pcm_oss_file = file->private_data; if (cmd == OSS_GETVERSION) return put_user(SNDRV_OSS_VERSION, p); if (cmd == OSS_ALSAEMULVER) return put_user(1, p); #if IS_REACHABLE(CONFIG_SND_MIXER_OSS) if (((cmd >> 8) & 0xff) == 'M') { /* mixer ioctl - for OSS compatibility */ struct snd_pcm_substream *substream; int idx; for (idx = 0; idx < 2; ++idx) { substream = pcm_oss_file->streams[idx]; if (substream != NULL) break; } if (snd_BUG_ON(idx >= 2)) return -ENXIO; return snd_mixer_oss_ioctl_card(substream->pcm->card, cmd, arg); } #endif if (((cmd >> 8) & 0xff) != 'P') 
return -EINVAL; #ifdef OSS_DEBUG pr_debug("pcm_oss: ioctl = 0x%x\n", cmd); #endif switch (cmd) { case SNDCTL_DSP_RESET: return snd_pcm_oss_reset(pcm_oss_file); case SNDCTL_DSP_SYNC: return snd_pcm_oss_sync(pcm_oss_file); case SNDCTL_DSP_SPEED: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_rate(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_RATE: res = snd_pcm_oss_get_rate(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_STEREO: if (get_user(res, p)) return -EFAULT; res = res > 0 ? 2 : 1; res = snd_pcm_oss_set_channels(pcm_oss_file, res); if (res < 0) return res; return put_user(--res, p); case SNDCTL_DSP_GETBLKSIZE: res = snd_pcm_oss_get_block_size(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETFMT: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_format(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_BITS: res = snd_pcm_oss_get_format(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_CHANNELS: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_channels(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_CHANNELS: res = snd_pcm_oss_get_channels(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_WRITE_FILTER: case SOUND_PCM_READ_FILTER: return -EIO; case SNDCTL_DSP_POST: return snd_pcm_oss_post(pcm_oss_file); case SNDCTL_DSP_SUBDIVIDE: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_subdivide(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETFRAGMENT: if (get_user(res, p)) return -EFAULT; return snd_pcm_oss_set_fragment(pcm_oss_file, res); case SNDCTL_DSP_GETFMTS: res = snd_pcm_oss_get_formats(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_GETOSPACE: case SNDCTL_DSP_GETISPACE: return 
snd_pcm_oss_get_space(pcm_oss_file, cmd == SNDCTL_DSP_GETISPACE ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct audio_buf_info __user *) arg); case SNDCTL_DSP_NONBLOCK: return snd_pcm_oss_nonblock(file); case SNDCTL_DSP_GETCAPS: res = snd_pcm_oss_get_caps(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_GETTRIGGER: res = snd_pcm_oss_get_trigger(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETTRIGGER: if (get_user(res, p)) return -EFAULT; return snd_pcm_oss_set_trigger(pcm_oss_file, res); case SNDCTL_DSP_GETIPTR: case SNDCTL_DSP_GETOPTR: return snd_pcm_oss_get_ptr(pcm_oss_file, cmd == SNDCTL_DSP_GETIPTR ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct count_info __user *) arg); case SNDCTL_DSP_MAPINBUF: case SNDCTL_DSP_MAPOUTBUF: return snd_pcm_oss_get_mapbuf(pcm_oss_file, cmd == SNDCTL_DSP_MAPINBUF ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct buffmem_desc __user *) arg); case SNDCTL_DSP_SETSYNCRO: /* stop DMA now.. 
*/ return 0; case SNDCTL_DSP_SETDUPLEX: if (snd_pcm_oss_get_caps(pcm_oss_file) & DSP_CAP_DUPLEX) return 0; return -EIO; case SNDCTL_DSP_GETODELAY: res = snd_pcm_oss_get_odelay(pcm_oss_file); if (res < 0) { /* it's for sure, some broken apps don't check for error codes */ put_user(0, p); return res; } return put_user(res, p); case SNDCTL_DSP_PROFILE: return 0; /* silently ignore */ default: pr_debug("pcm_oss: unknown command = 0x%x\n", cmd); } return -EINVAL; } #ifdef CONFIG_COMPAT /* all compatible */ static long snd_pcm_oss_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { /* * Everything is compatbile except SNDCTL_DSP_MAPINBUF/SNDCTL_DSP_MAPOUTBUF, * which are not implemented for the native case either */ return snd_pcm_oss_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } #else #define snd_pcm_oss_ioctl_compat NULL #endif static ssize_t snd_pcm_oss_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (substream == NULL) return -ENXIO; substream->f_flags = file->f_flags & O_NONBLOCK; #ifndef OSS_DEBUG return snd_pcm_oss_read1(substream, buf, count); #else { ssize_t res = snd_pcm_oss_read1(substream, buf, count); pcm_dbg(substream->pcm, "pcm_oss: read %li bytes (returned %li bytes)\n", (long)count, (long)res); return res; } #endif } static ssize_t snd_pcm_oss_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream; long result; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -ENXIO; substream->f_flags = file->f_flags & O_NONBLOCK; result = snd_pcm_oss_write1(substream, buf, count); #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: write %li bytes (wrote %li bytes)\n", 
(long)count, (long)result); #endif return result; } static int snd_pcm_oss_playback_ready(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return runtime->oss.prev_hw_ptr_period != get_hw_ptr_period(runtime); else return snd_pcm_playback_avail(runtime) >= runtime->oss.period_frames; } static int snd_pcm_oss_capture_ready(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return runtime->oss.prev_hw_ptr_period != get_hw_ptr_period(runtime); else return snd_pcm_capture_avail(runtime) >= runtime->oss.period_frames; } static __poll_t snd_pcm_oss_poll(struct file *file, poll_table * wait) { struct snd_pcm_oss_file *pcm_oss_file; __poll_t mask; struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL; pcm_oss_file = file->private_data; psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; mask = 0; if (psubstream != NULL) { struct snd_pcm_runtime *runtime = psubstream->runtime; poll_wait(file, &runtime->sleep, wait); scoped_guard(pcm_stream_lock_irq, psubstream) { if (runtime->state != SNDRV_PCM_STATE_DRAINING && (runtime->state != SNDRV_PCM_STATE_RUNNING || snd_pcm_oss_playback_ready(psubstream))) mask |= EPOLLOUT | EPOLLWRNORM; } } if (csubstream != NULL) { struct snd_pcm_runtime *runtime = csubstream->runtime; snd_pcm_state_t ostate; poll_wait(file, &runtime->sleep, wait); scoped_guard(pcm_stream_lock_irq, csubstream) { ostate = runtime->state; if (ostate != SNDRV_PCM_STATE_RUNNING || snd_pcm_oss_capture_ready(csubstream)) mask |= EPOLLIN | EPOLLRDNORM; } if (ostate != SNDRV_PCM_STATE_RUNNING && runtime->oss.trigger) { struct snd_pcm_oss_file ofile; memset(&ofile, 0, sizeof(ofile)); ofile.streams[SNDRV_PCM_STREAM_CAPTURE] = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; runtime->oss.trigger = 0; 
snd_pcm_oss_set_trigger(&ofile, PCM_ENABLE_INPUT); } } return mask; } static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream = NULL; struct snd_pcm_runtime *runtime; int err; #ifdef OSS_DEBUG pr_debug("pcm_oss: mmap begin\n"); #endif pcm_oss_file = file->private_data; switch ((area->vm_flags & (VM_READ | VM_WRITE))) { case VM_READ | VM_WRITE: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream) break; fallthrough; case VM_READ: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; break; case VM_WRITE: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; break; default: return -EINVAL; } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; if (!(runtime->info & SNDRV_PCM_INFO_MMAP_VALID)) return -EIO; if (runtime->info & SNDRV_PCM_INFO_INTERLEAVED) runtime->access = SNDRV_PCM_ACCESS_MMAP_INTERLEAVED; else return -EIO; if (runtime->oss.params) { /* use mutex_trylock() for params_lock for avoiding a deadlock * between mmap_lock and params_lock taken by * copy_from/to_user() in snd_pcm_oss_write/read() */ err = snd_pcm_oss_change_params(substream, true); if (err < 0) return err; } #ifdef CONFIG_SND_PCM_OSS_PLUGINS if (runtime->oss.plugin_first != NULL) return -EIO; #endif if (area->vm_pgoff != 0) return -EINVAL; err = snd_pcm_mmap_data(substream, file, area); if (err < 0) return err; runtime->oss.mmap_bytes = area->vm_end - area->vm_start; runtime->silence_threshold = 0; runtime->silence_size = 0; #ifdef OSS_DEBUG pr_debug("pcm_oss: mmap ok, bytes = 0x%x\n", runtime->oss.mmap_bytes); #endif /* In mmap mode we never stop */ runtime->stop_threshold = runtime->boundary; return 0; } #ifdef CONFIG_SND_VERBOSE_PROCFS /* * /proc interface */ static void snd_pcm_oss_proc_read(struct 
snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_str *pstr = entry->private_data; struct snd_pcm_oss_setup *setup = pstr->oss.setup_list; guard(mutex)(&pstr->oss.setup_mutex); while (setup) { snd_iprintf(buffer, "%s %u %u%s%s%s%s%s%s\n", setup->task_name, setup->periods, setup->period_size, setup->disable ? " disable" : "", setup->direct ? " direct" : "", setup->block ? " block" : "", setup->nonblock ? " non-block" : "", setup->partialfrag ? " partial-frag" : "", setup->nosilence ? " no-silence" : ""); setup = setup->next; } } static void snd_pcm_oss_proc_free_setup_list(struct snd_pcm_str * pstr) { struct snd_pcm_oss_setup *setup, *setupn; for (setup = pstr->oss.setup_list, pstr->oss.setup_list = NULL; setup; setup = setupn) { setupn = setup->next; kfree(setup->task_name); kfree(setup); } pstr->oss.setup_list = NULL; } static void snd_pcm_oss_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_str *pstr = entry->private_data; char line[128], str[32], task_name[32]; const char *ptr; int idx1; struct snd_pcm_oss_setup *setup, *setup1, template; while (!snd_info_get_line(buffer, line, sizeof(line))) { guard(mutex)(&pstr->oss.setup_mutex); memset(&template, 0, sizeof(template)); ptr = snd_info_get_str(task_name, line, sizeof(task_name)); if (!strcmp(task_name, "clear") || !strcmp(task_name, "erase")) { snd_pcm_oss_proc_free_setup_list(pstr); continue; } for (setup = pstr->oss.setup_list; setup; setup = setup->next) { if (!strcmp(setup->task_name, task_name)) { template = *setup; break; } } ptr = snd_info_get_str(str, ptr, sizeof(str)); template.periods = simple_strtoul(str, NULL, 10); ptr = snd_info_get_str(str, ptr, sizeof(str)); template.period_size = simple_strtoul(str, NULL, 10); for (idx1 = 31; idx1 >= 0; idx1--) if (template.period_size & (1 << idx1)) break; for (idx1--; idx1 >= 0; idx1--) template.period_size &= ~(1 << idx1); do { ptr = snd_info_get_str(str, ptr, sizeof(str)); if (!strcmp(str, 
"disable")) { template.disable = 1; } else if (!strcmp(str, "direct")) { template.direct = 1; } else if (!strcmp(str, "block")) { template.block = 1; } else if (!strcmp(str, "non-block")) { template.nonblock = 1; } else if (!strcmp(str, "partial-frag")) { template.partialfrag = 1; } else if (!strcmp(str, "no-silence")) { template.nosilence = 1; } else if (!strcmp(str, "buggy-ptr")) { template.buggyptr = 1; } } while (*str); if (setup == NULL) { setup = kmalloc(sizeof(*setup), GFP_KERNEL); if (! setup) { buffer->error = -ENOMEM; return; } if (pstr->oss.setup_list == NULL) pstr->oss.setup_list = setup; else { for (setup1 = pstr->oss.setup_list; setup1->next; setup1 = setup1->next); setup1->next = setup; } template.task_name = kstrdup(task_name, GFP_KERNEL); if (! template.task_name) { kfree(setup); buffer->error = -ENOMEM; return; } } *setup = template; } } static void snd_pcm_oss_proc_init(struct snd_pcm *pcm) { int stream; for (stream = 0; stream < 2; ++stream) { struct snd_info_entry *entry; struct snd_pcm_str *pstr = &pcm->streams[stream]; if (pstr->substream_count == 0) continue; entry = snd_info_create_card_entry(pcm->card, "oss", pstr->proc_root); if (entry) { entry->content = SNDRV_INFO_CONTENT_TEXT; entry->mode = S_IFREG | 0644; entry->c.text.read = snd_pcm_oss_proc_read; entry->c.text.write = snd_pcm_oss_proc_write; entry->private_data = pstr; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); entry = NULL; } } pstr->oss.proc_entry = entry; } } static void snd_pcm_oss_proc_done(struct snd_pcm *pcm) { int stream; for (stream = 0; stream < 2; ++stream) { struct snd_pcm_str *pstr = &pcm->streams[stream]; snd_info_free_entry(pstr->oss.proc_entry); pstr->oss.proc_entry = NULL; snd_pcm_oss_proc_free_setup_list(pstr); } } #else /* !CONFIG_SND_VERBOSE_PROCFS */ static inline void snd_pcm_oss_proc_init(struct snd_pcm *pcm) { } static inline void snd_pcm_oss_proc_done(struct snd_pcm *pcm) { } #endif /* CONFIG_SND_VERBOSE_PROCFS */ /* * ENTRY functions */ 
static const struct file_operations snd_pcm_oss_f_reg = { .owner = THIS_MODULE, .read = snd_pcm_oss_read, .write = snd_pcm_oss_write, .open = snd_pcm_oss_open, .release = snd_pcm_oss_release, .poll = snd_pcm_oss_poll, .unlocked_ioctl = snd_pcm_oss_ioctl, .compat_ioctl = snd_pcm_oss_ioctl_compat, .mmap = snd_pcm_oss_mmap, }; static void register_oss_dsp(struct snd_pcm *pcm, int index) { if (snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, index, &snd_pcm_oss_f_reg, pcm) < 0) { pcm_err(pcm, "unable to register OSS PCM device %i:%i\n", pcm->card->number, pcm->device); } } static int snd_pcm_oss_register_minor(struct snd_pcm *pcm) { pcm->oss.reg = 0; if (dsp_map[pcm->card->number] == (int)pcm->device) { char name[128]; int duplex; register_oss_dsp(pcm, 0); duplex = (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream_count > 0 && pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream_count && !(pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX)); sprintf(name, "%s%s", pcm->name, duplex ? " (DUPLEX)" : ""); #ifdef SNDRV_OSS_INFO_DEV_AUDIO snd_oss_info_register(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number, name); #endif pcm->oss.reg++; pcm->oss.reg_mask |= 1; } if (adsp_map[pcm->card->number] == (int)pcm->device) { register_oss_dsp(pcm, 1); pcm->oss.reg++; pcm->oss.reg_mask |= 2; } if (pcm->oss.reg) snd_pcm_oss_proc_init(pcm); return 0; } static int snd_pcm_oss_disconnect_minor(struct snd_pcm *pcm) { if (pcm->oss.reg) { if (pcm->oss.reg_mask & 1) { pcm->oss.reg_mask &= ~1; snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, 0); } if (pcm->oss.reg_mask & 2) { pcm->oss.reg_mask &= ~2; snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, 1); } if (dsp_map[pcm->card->number] == (int)pcm->device) { #ifdef SNDRV_OSS_INFO_DEV_AUDIO snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number); #endif } pcm->oss.reg = 0; } return 0; } static int snd_pcm_oss_unregister_minor(struct snd_pcm *pcm) { snd_pcm_oss_disconnect_minor(pcm); 
snd_pcm_oss_proc_done(pcm); return 0; } static struct snd_pcm_notify snd_pcm_oss_notify = { .n_register = snd_pcm_oss_register_minor, .n_disconnect = snd_pcm_oss_disconnect_minor, .n_unregister = snd_pcm_oss_unregister_minor, }; static int __init alsa_pcm_oss_init(void) { int i; int err; /* check device map table */ for (i = 0; i < SNDRV_CARDS; i++) { if (dsp_map[i] < 0 || dsp_map[i] >= SNDRV_PCM_DEVICES) { pr_err("ALSA: pcm_oss: invalid dsp_map[%d] = %d\n", i, dsp_map[i]); dsp_map[i] = 0; } if (adsp_map[i] < 0 || adsp_map[i] >= SNDRV_PCM_DEVICES) { pr_err("ALSA: pcm_oss: invalid adsp_map[%d] = %d\n", i, adsp_map[i]); adsp_map[i] = 1; } } err = snd_pcm_notify(&snd_pcm_oss_notify, 0); if (err < 0) return err; return 0; } static void __exit alsa_pcm_oss_exit(void) { snd_pcm_notify(&snd_pcm_oss_notify, 1); } module_init(alsa_pcm_oss_init) module_exit(alsa_pcm_oss_exit)
3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 4 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 
1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 // SPDX-License-Identifier: GPL-2.0-only /* * hid-ft260.c - FTDI FT260 USB HID to I2C host bridge * * Copyright (c) 2021, Michael Zaidman <michaelz@xsightlabs.com> * * Data Sheet: * https://www.ftdichip.com/Support/Documents/DataSheets/ICs/DS_FT260.pdf */ #include "hid-ids.h" #include <linux/hidraw.h> #include <linux/i2c.h> #include <linux/module.h> #include <linux/usb.h> #ifdef DEBUG static int ft260_debug = 1; #else static int ft260_debug; #endif module_param_named(debug, ft260_debug, int, 0600); MODULE_PARM_DESC(debug, "Toggle FT260 debugging messages"); #define ft260_dbg(format, arg...) \ do { \ if (ft260_debug) \ pr_info("%s: " format, __func__, ##arg); \ } while (0) #define FT260_REPORT_MAX_LENGTH (64) #define FT260_I2C_DATA_REPORT_ID(len) (FT260_I2C_REPORT_MIN + (len - 1) / 4) #define FT260_WAKEUP_NEEDED_AFTER_MS (4800) /* 5s minus 200ms margin */ /* * The ft260 input report format defines 62 bytes for the data payload, but * when requested 62 bytes, the controller returns 60 and 2 in separate input * reports. To achieve better performance with the multi-report read data * transfers, we set the maximum read payload length to a multiple of 60. * With a 100 kHz I2C clock, one 240 bytes read takes about 1/27 second, * which is excessive; On the other hand, some higher layer drivers like at24 * or optoe limit the i2c reads to 128 bytes. 
To not block other drivers out * of I2C for potentially troublesome amounts of time, we select the maximum * read payload length to be 180 bytes. */ #define FT260_RD_DATA_MAX (180) #define FT260_WR_DATA_MAX (60) /* * Device interface configuration. * The FT260 has 2 interfaces that are controlled by DCNF0 and DCNF1 pins. * First implementes USB HID to I2C bridge function and * second - USB HID to UART bridge function. */ enum { FT260_MODE_ALL = 0x00, FT260_MODE_I2C = 0x01, FT260_MODE_UART = 0x02, FT260_MODE_BOTH = 0x03, }; /* Control pipe */ enum { FT260_GET_RQST_TYPE = 0xA1, FT260_GET_REPORT = 0x01, FT260_SET_RQST_TYPE = 0x21, FT260_SET_REPORT = 0x09, FT260_FEATURE = 0x03, }; /* Report IDs / Feature In */ enum { FT260_CHIP_VERSION = 0xA0, FT260_SYSTEM_SETTINGS = 0xA1, FT260_I2C_STATUS = 0xC0, FT260_I2C_READ_REQ = 0xC2, FT260_I2C_REPORT_MIN = 0xD0, FT260_I2C_REPORT_MAX = 0xDE, FT260_GPIO = 0xB0, FT260_UART_INTERRUPT_STATUS = 0xB1, FT260_UART_STATUS = 0xE0, FT260_UART_RI_DCD_STATUS = 0xE1, FT260_UART_REPORT = 0xF0, }; /* Feature Out */ enum { FT260_SET_CLOCK = 0x01, FT260_SET_I2C_MODE = 0x02, FT260_SET_UART_MODE = 0x03, FT260_ENABLE_INTERRUPT = 0x05, FT260_SELECT_GPIO2_FUNC = 0x06, FT260_ENABLE_UART_DCD_RI = 0x07, FT260_SELECT_GPIOA_FUNC = 0x08, FT260_SELECT_GPIOG_FUNC = 0x09, FT260_SET_INTERRUPT_TRIGGER = 0x0A, FT260_SET_SUSPEND_OUT_POLAR = 0x0B, FT260_ENABLE_UART_RI_WAKEUP = 0x0C, FT260_SET_UART_RI_WAKEUP_CFG = 0x0D, FT260_SET_I2C_RESET = 0x20, FT260_SET_I2C_CLOCK_SPEED = 0x22, FT260_SET_UART_RESET = 0x40, FT260_SET_UART_CONFIG = 0x41, FT260_SET_UART_BAUD_RATE = 0x42, FT260_SET_UART_DATA_BIT = 0x43, FT260_SET_UART_PARITY = 0x44, FT260_SET_UART_STOP_BIT = 0x45, FT260_SET_UART_BREAKING = 0x46, FT260_SET_UART_XON_XOFF = 0x49, }; /* Response codes in I2C status report */ enum { FT260_I2C_STATUS_SUCCESS = 0x00, FT260_I2C_STATUS_CTRL_BUSY = 0x01, FT260_I2C_STATUS_ERROR = 0x02, FT260_I2C_STATUS_ADDR_NO_ACK = 0x04, FT260_I2C_STATUS_DATA_NO_ACK = 0x08, 
FT260_I2C_STATUS_ARBITR_LOST = 0x10, FT260_I2C_STATUS_CTRL_IDLE = 0x20, FT260_I2C_STATUS_BUS_BUSY = 0x40, }; /* I2C Conditions flags */ enum { FT260_FLAG_NONE = 0x00, FT260_FLAG_START = 0x02, FT260_FLAG_START_REPEATED = 0x03, FT260_FLAG_STOP = 0x04, FT260_FLAG_START_STOP = 0x06, FT260_FLAG_START_STOP_REPEATED = 0x07, }; #define FT260_SET_REQUEST_VALUE(report_id) ((FT260_FEATURE << 8) | report_id) /* Feature In reports */ struct ft260_get_chip_version_report { u8 report; /* FT260_CHIP_VERSION */ u8 chip_code[4]; /* FTDI chip identification code */ u8 reserved[8]; } __packed; struct ft260_get_system_status_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 chip_mode; /* DCNF0 and DCNF1 status, bits 0-1 */ u8 clock_ctl; /* 0 - 12MHz, 1 - 24MHz, 2 - 48MHz */ u8 suspend_status; /* 0 - not suspended, 1 - suspended */ u8 pwren_status; /* 0 - FT260 is not ready, 1 - ready */ u8 i2c_enable; /* 0 - disabled, 1 - enabled */ u8 uart_mode; /* 0 - OFF; 1 - RTS_CTS, 2 - DTR_DSR, */ /* 3 - XON_XOFF, 4 - No flow control */ u8 hid_over_i2c_en; /* 0 - disabled, 1 - enabled */ u8 gpio2_function; /* 0 - GPIO, 1 - SUSPOUT, */ /* 2 - PWREN, 4 - TX_LED */ u8 gpioA_function; /* 0 - GPIO, 3 - TX_ACTIVE, 4 - TX_LED */ u8 gpioG_function; /* 0 - GPIO, 2 - PWREN, */ /* 5 - RX_LED, 6 - BCD_DET */ u8 suspend_out_pol; /* 0 - active-high, 1 - active-low */ u8 enable_wakeup_int; /* 0 - disabled, 1 - enabled */ u8 intr_cond; /* Interrupt trigger conditions */ u8 power_saving_en; /* 0 - disabled, 1 - enabled */ u8 reserved[10]; } __packed; struct ft260_get_i2c_status_report { u8 report; /* FT260_I2C_STATUS */ u8 bus_status; /* I2C bus status */ __le16 clock; /* I2C bus clock in range 60-3400 KHz */ u8 reserved; } __packed; /* Feature Out reports */ struct ft260_set_system_clock_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_CLOCK */ u8 clock_ctl; /* 0 - 12MHz, 1 - 24MHz, 2 - 48MHz */ } __packed; struct ft260_set_i2c_mode_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 
request; /* FT260_SET_I2C_MODE */ u8 i2c_enable; /* 0 - disabled, 1 - enabled */ } __packed; struct ft260_set_uart_mode_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_UART_MODE */ u8 uart_mode; /* 0 - OFF; 1 - RTS_CTS, 2 - DTR_DSR, */ /* 3 - XON_XOFF, 4 - No flow control */ } __packed; struct ft260_set_i2c_reset_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_I2C_RESET */ } __packed; struct ft260_set_i2c_speed_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_I2C_CLOCK_SPEED */ __le16 clock; /* I2C bus clock in range 60-3400 KHz */ } __packed; /* Data transfer reports */ struct ft260_i2c_write_request_report { u8 report; /* FT260_I2C_REPORT */ u8 address; /* 7-bit I2C address */ u8 flag; /* I2C transaction condition */ u8 length; /* data payload length */ u8 data[FT260_WR_DATA_MAX]; /* data payload */ } __packed; struct ft260_i2c_read_request_report { u8 report; /* FT260_I2C_READ_REQ */ u8 address; /* 7-bit I2C address */ u8 flag; /* I2C transaction condition */ __le16 length; /* data payload length */ } __packed; struct ft260_i2c_input_report { u8 report; /* FT260_I2C_REPORT */ u8 length; /* data payload length */ u8 data[2]; /* data payload */ } __packed; static const struct hid_device_id ft260_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_FUTURE_TECHNOLOGY, USB_DEVICE_ID_FT260) }, { /* END OF LIST */ } }; MODULE_DEVICE_TABLE(hid, ft260_devices); struct ft260_device { struct i2c_adapter adap; struct hid_device *hdev; struct completion wait; struct mutex lock; u8 write_buf[FT260_REPORT_MAX_LENGTH]; unsigned long need_wakeup_at; u8 *read_buf; u16 read_idx; u16 read_len; u16 clock; }; static int ft260_hid_feature_report_get(struct hid_device *hdev, unsigned char report_id, u8 *data, size_t len) { u8 *buf; int ret; buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_raw_request(hdev, report_id, buf, len, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (likely(ret == len)) 
memcpy(data, buf, len); else if (ret >= 0) ret = -EIO; kfree(buf); return ret; } static int ft260_hid_feature_report_set(struct hid_device *hdev, u8 *data, size_t len) { u8 *buf; int ret; buf = kmemdup(data, len, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = FT260_SYSTEM_SETTINGS; ret = hid_hw_raw_request(hdev, buf[0], buf, len, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); kfree(buf); return ret; } static int ft260_i2c_reset(struct hid_device *hdev) { struct ft260_set_i2c_reset_report report; int ret; report.request = FT260_SET_I2C_RESET; ret = ft260_hid_feature_report_set(hdev, (u8 *)&report, sizeof(report)); if (ret < 0) { hid_err(hdev, "failed to reset I2C controller: %d\n", ret); return ret; } ft260_dbg("done\n"); return ret; } static int ft260_xfer_status(struct ft260_device *dev, u8 bus_busy) { struct hid_device *hdev = dev->hdev; struct ft260_get_i2c_status_report report; int ret; if (time_is_before_jiffies(dev->need_wakeup_at)) { ret = ft260_hid_feature_report_get(hdev, FT260_I2C_STATUS, (u8 *)&report, sizeof(report)); if (unlikely(ret < 0)) { hid_err(hdev, "failed to retrieve status: %d, no wakeup\n", ret); } else { dev->need_wakeup_at = jiffies + msecs_to_jiffies(FT260_WAKEUP_NEEDED_AFTER_MS); ft260_dbg("bus_status %#02x, wakeup\n", report.bus_status); } } ret = ft260_hid_feature_report_get(hdev, FT260_I2C_STATUS, (u8 *)&report, sizeof(report)); if (unlikely(ret < 0)) { hid_err(hdev, "failed to retrieve status: %d\n", ret); return ret; } dev->clock = le16_to_cpu(report.clock); ft260_dbg("bus_status %#02x, clock %u\n", report.bus_status, dev->clock); if (report.bus_status & (FT260_I2C_STATUS_CTRL_BUSY | bus_busy)) return -EAGAIN; /* * The error condition (bit 1) is a status bit reflecting any * error conditions. When any of the bits 2, 3, or 4 are raised * to 1, bit 1 is also set to 1. 
*/ if (report.bus_status & FT260_I2C_STATUS_ERROR) { hid_err(hdev, "i2c bus error: %#02x\n", report.bus_status); return -EIO; } return 0; } static int ft260_hid_output_report(struct hid_device *hdev, u8 *data, size_t len) { u8 *buf; int ret; buf = kmemdup(data, len, GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_output_report(hdev, buf, len); kfree(buf); return ret; } static int ft260_hid_output_report_check_status(struct ft260_device *dev, u8 *data, int len) { u8 bus_busy; int ret, usec, try = 100; struct hid_device *hdev = dev->hdev; struct ft260_i2c_write_request_report *rep = (struct ft260_i2c_write_request_report *)data; ret = ft260_hid_output_report(hdev, data, len); if (ret < 0) { hid_err(hdev, "%s: failed to start transfer, ret %d\n", __func__, ret); ft260_i2c_reset(hdev); return ret; } /* transfer time = 1 / clock(KHz) * 9 bits * bytes */ usec = len * 9000 / dev->clock; if (usec > 2000) { usec -= 1500; usleep_range(usec, usec + 100); ft260_dbg("wait %d usec, len %d\n", usec, len); } /* * Do not check the busy bit for combined transactions * since the controller keeps the bus busy between writing * and reading IOs to ensure an atomic operation. 
*/ if (rep->flag == FT260_FLAG_START) bus_busy = 0; else bus_busy = FT260_I2C_STATUS_BUS_BUSY; do { ret = ft260_xfer_status(dev, bus_busy); if (ret != -EAGAIN) break; } while (--try); if (ret == 0) return 0; ft260_i2c_reset(hdev); return -EIO; } static int ft260_i2c_write(struct ft260_device *dev, u8 addr, u8 *data, int len, u8 flag) { int ret, wr_len, idx = 0; struct hid_device *hdev = dev->hdev; struct ft260_i2c_write_request_report *rep = (struct ft260_i2c_write_request_report *)dev->write_buf; if (len < 1) return -EINVAL; rep->flag = FT260_FLAG_START; do { if (len <= FT260_WR_DATA_MAX) { wr_len = len; if (flag == FT260_FLAG_START_STOP) rep->flag |= FT260_FLAG_STOP; } else { wr_len = FT260_WR_DATA_MAX; } rep->report = FT260_I2C_DATA_REPORT_ID(wr_len); rep->address = addr; rep->length = wr_len; memcpy(rep->data, &data[idx], wr_len); ft260_dbg("rep %#02x addr %#02x off %d len %d wlen %d flag %#x d[0] %#02x\n", rep->report, addr, idx, len, wr_len, rep->flag, data[0]); ret = ft260_hid_output_report_check_status(dev, (u8 *)rep, wr_len + 4); if (ret < 0) { hid_err(hdev, "%s: failed with %d\n", __func__, ret); return ret; } len -= wr_len; idx += wr_len; rep->flag = 0; } while (len > 0); return 0; } static int ft260_smbus_write(struct ft260_device *dev, u8 addr, u8 cmd, u8 *data, u8 data_len, u8 flag) { int ret = 0; int len = 4; struct ft260_i2c_write_request_report *rep = (struct ft260_i2c_write_request_report *)dev->write_buf; if (data_len >= sizeof(rep->data)) return -EINVAL; rep->address = addr; rep->data[0] = cmd; rep->length = data_len + 1; rep->flag = flag; len += rep->length; rep->report = FT260_I2C_DATA_REPORT_ID(len); if (data_len > 0) memcpy(&rep->data[1], data, data_len); ft260_dbg("rep %#02x addr %#02x cmd %#02x datlen %d replen %d\n", rep->report, addr, cmd, rep->length, len); ret = ft260_hid_output_report_check_status(dev, (u8 *)rep, len); return ret; } static int ft260_i2c_read(struct ft260_device *dev, u8 addr, u8 *data, u16 len, u8 flag) { u16 rd_len; 
u16 rd_data_max = 60; int timeout, ret = 0; struct ft260_i2c_read_request_report rep; struct hid_device *hdev = dev->hdev; u8 bus_busy = 0; if ((flag & FT260_FLAG_START_REPEATED) == FT260_FLAG_START_REPEATED) flag = FT260_FLAG_START_REPEATED; else flag = FT260_FLAG_START; do { if (len <= rd_data_max) { rd_len = len; flag |= FT260_FLAG_STOP; } else { rd_len = rd_data_max; } rd_data_max = FT260_RD_DATA_MAX; rep.report = FT260_I2C_READ_REQ; rep.length = cpu_to_le16(rd_len); rep.address = addr; rep.flag = flag; ft260_dbg("rep %#02x addr %#02x len %d rlen %d flag %#x\n", rep.report, rep.address, len, rd_len, flag); reinit_completion(&dev->wait); dev->read_idx = 0; dev->read_buf = data; dev->read_len = rd_len; ret = ft260_hid_output_report(hdev, (u8 *)&rep, sizeof(rep)); if (ret < 0) { hid_err(hdev, "%s: failed with %d\n", __func__, ret); goto ft260_i2c_read_exit; } timeout = msecs_to_jiffies(5000); if (!wait_for_completion_timeout(&dev->wait, timeout)) { ret = -ETIMEDOUT; ft260_i2c_reset(hdev); goto ft260_i2c_read_exit; } dev->read_buf = NULL; if (flag & FT260_FLAG_STOP) bus_busy = FT260_I2C_STATUS_BUS_BUSY; ret = ft260_xfer_status(dev, bus_busy); if (ret < 0) { ret = -EIO; ft260_i2c_reset(hdev); goto ft260_i2c_read_exit; } len -= rd_len; data += rd_len; flag = 0; } while (len > 0); ft260_i2c_read_exit: dev->read_buf = NULL; return ret; } /* * A random read operation is implemented as a dummy write operation, followed * by a current address read operation. The dummy write operation is used to * load the target byte address into the current byte address counter, from * which the subsequent current address read operation then reads. 
*/ static int ft260_i2c_write_read(struct ft260_device *dev, struct i2c_msg *msgs) { int ret; int wr_len = msgs[0].len; int rd_len = msgs[1].len; struct hid_device *hdev = dev->hdev; u8 addr = msgs[0].addr; u16 read_off = 0; if (wr_len > 2) { hid_err(hdev, "%s: invalid wr_len: %d\n", __func__, wr_len); return -EOPNOTSUPP; } if (ft260_debug) { if (wr_len == 2) read_off = be16_to_cpu(*(__be16 *)msgs[0].buf); else read_off = *msgs[0].buf; pr_info("%s: off %#x rlen %d wlen %d\n", __func__, read_off, rd_len, wr_len); } ret = ft260_i2c_write(dev, addr, msgs[0].buf, wr_len, FT260_FLAG_START); if (ret < 0) return ret; ret = ft260_i2c_read(dev, addr, msgs[1].buf, rd_len, FT260_FLAG_START_STOP_REPEATED); if (ret < 0) return ret; return 0; } static int ft260_i2c_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs, int num) { int ret; struct ft260_device *dev = i2c_get_adapdata(adapter); struct hid_device *hdev = dev->hdev; mutex_lock(&dev->lock); ret = hid_hw_power(hdev, PM_HINT_FULLON); if (ret < 0) { hid_err(hdev, "failed to enter FULLON power mode: %d\n", ret); mutex_unlock(&dev->lock); return ret; } if (num == 1) { if (msgs->flags & I2C_M_RD) ret = ft260_i2c_read(dev, msgs->addr, msgs->buf, msgs->len, FT260_FLAG_START_STOP); else ret = ft260_i2c_write(dev, msgs->addr, msgs->buf, msgs->len, FT260_FLAG_START_STOP); if (ret < 0) goto i2c_exit; } else { /* Combined write then read message */ ret = ft260_i2c_write_read(dev, msgs); if (ret < 0) goto i2c_exit; } ret = num; i2c_exit: hid_hw_power(hdev, PM_HINT_NORMAL); mutex_unlock(&dev->lock); return ret; } static int ft260_smbus_xfer(struct i2c_adapter *adapter, u16 addr, u16 flags, char read_write, u8 cmd, int size, union i2c_smbus_data *data) { int ret; struct ft260_device *dev = i2c_get_adapdata(adapter); struct hid_device *hdev = dev->hdev; ft260_dbg("smbus size %d\n", size); mutex_lock(&dev->lock); ret = hid_hw_power(hdev, PM_HINT_FULLON); if (ret < 0) { hid_err(hdev, "power management error: %d\n", ret); 
mutex_unlock(&dev->lock); return ret; } switch (size) { case I2C_SMBUS_BYTE: if (read_write == I2C_SMBUS_READ) ret = ft260_i2c_read(dev, addr, &data->byte, 1, FT260_FLAG_START_STOP); else ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START_STOP); break; case I2C_SMBUS_BYTE_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, &data->byte, 1, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, &data->byte, 1, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_WORD_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, (u8 *)&data->word, 2, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, (u8 *)&data->word, 2, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_BLOCK_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, data->block, data->block[0] + 1, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, data->block, data->block[0] + 1, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_I2C_BLOCK_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, data->block + 1, data->block[0], FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, data->block + 1, data->block[0], FT260_FLAG_START_STOP); } break; default: hid_err(hdev, "unsupported smbus transaction size %d\n", size); ret = -EOPNOTSUPP; } smbus_exit: hid_hw_power(hdev, PM_HINT_NORMAL); mutex_unlock(&dev->lock); return ret; } static u32 ft260_functionality(struct i2c_adapter *adap) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_BYTE | 
I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA | I2C_FUNC_SMBUS_I2C_BLOCK; } static const struct i2c_adapter_quirks ft260_i2c_quirks = { .flags = I2C_AQ_COMB_WRITE_THEN_READ, .max_comb_1st_msg_len = 2, }; static const struct i2c_algorithm ft260_i2c_algo = { .master_xfer = ft260_i2c_xfer, .smbus_xfer = ft260_smbus_xfer, .functionality = ft260_functionality, }; static int ft260_get_system_config(struct hid_device *hdev, struct ft260_get_system_status_report *cfg) { int ret; int len = sizeof(struct ft260_get_system_status_report); ret = ft260_hid_feature_report_get(hdev, FT260_SYSTEM_SETTINGS, (u8 *)cfg, len); if (ret < 0) { hid_err(hdev, "failed to retrieve system status\n"); return ret; } return 0; } static int ft260_is_interface_enabled(struct hid_device *hdev) { struct ft260_get_system_status_report cfg; struct usb_interface *usbif = to_usb_interface(hdev->dev.parent); int interface = usbif->cur_altsetting->desc.bInterfaceNumber; int ret; ret = ft260_get_system_config(hdev, &cfg); if (ret < 0) return ret; ft260_dbg("interface: 0x%02x\n", interface); ft260_dbg("chip mode: 0x%02x\n", cfg.chip_mode); ft260_dbg("clock_ctl: 0x%02x\n", cfg.clock_ctl); ft260_dbg("i2c_enable: 0x%02x\n", cfg.i2c_enable); ft260_dbg("uart_mode: 0x%02x\n", cfg.uart_mode); switch (cfg.chip_mode) { case FT260_MODE_ALL: case FT260_MODE_BOTH: if (interface == 1) hid_info(hdev, "uart interface is not supported\n"); else ret = 1; break; case FT260_MODE_UART: hid_info(hdev, "uart interface is not supported\n"); break; case FT260_MODE_I2C: ret = 1; break; } return ret; } static int ft260_byte_show(struct hid_device *hdev, int id, u8 *cfg, int len, u8 *field, u8 *buf) { int ret; ret = ft260_hid_feature_report_get(hdev, id, cfg, len); if (ret < 0) return ret; return scnprintf(buf, PAGE_SIZE, "%d\n", *field); } static int ft260_word_show(struct hid_device *hdev, int id, u8 *cfg, int len, __le16 *field, u8 *buf) { int ret; ret = ft260_hid_feature_report_get(hdev, id, cfg, 
len); if (ret < 0) return ret; return scnprintf(buf, PAGE_SIZE, "%d\n", le16_to_cpu(*field)); } #define FT260_ATTR_SHOW(name, reptype, id, type, func) \ static ssize_t name##_show(struct device *kdev, \ struct device_attribute *attr, char *buf) \ { \ struct reptype rep; \ struct hid_device *hdev = to_hid_device(kdev); \ type *field = &rep.name; \ int len = sizeof(rep); \ \ return func(hdev, id, (u8 *)&rep, len, field, buf); \ } #define FT260_SSTAT_ATTR_SHOW(name) \ FT260_ATTR_SHOW(name, ft260_get_system_status_report, \ FT260_SYSTEM_SETTINGS, u8, ft260_byte_show) #define FT260_I2CST_ATTR_SHOW(name) \ FT260_ATTR_SHOW(name, ft260_get_i2c_status_report, \ FT260_I2C_STATUS, __le16, ft260_word_show) #define FT260_ATTR_STORE(name, reptype, id, req, type, ctype, func) \ static ssize_t name##_store(struct device *kdev, \ struct device_attribute *attr, \ const char *buf, size_t count) \ { \ struct reptype rep; \ struct hid_device *hdev = to_hid_device(kdev); \ type name; \ int ret; \ \ if (!func(buf, 10, (ctype *)&name)) { \ rep.name = name; \ rep.report = id; \ rep.request = req; \ ret = ft260_hid_feature_report_set(hdev, (u8 *)&rep, \ sizeof(rep)); \ if (!ret) \ ret = count; \ } else { \ ret = -EINVAL; \ } \ return ret; \ } #define FT260_BYTE_ATTR_STORE(name, reptype, req) \ FT260_ATTR_STORE(name, reptype, FT260_SYSTEM_SETTINGS, req, \ u8, u8, kstrtou8) #define FT260_WORD_ATTR_STORE(name, reptype, req) \ FT260_ATTR_STORE(name, reptype, FT260_SYSTEM_SETTINGS, req, \ __le16, u16, kstrtou16) FT260_SSTAT_ATTR_SHOW(chip_mode); static DEVICE_ATTR_RO(chip_mode); FT260_SSTAT_ATTR_SHOW(pwren_status); static DEVICE_ATTR_RO(pwren_status); FT260_SSTAT_ATTR_SHOW(suspend_status); static DEVICE_ATTR_RO(suspend_status); FT260_SSTAT_ATTR_SHOW(hid_over_i2c_en); static DEVICE_ATTR_RO(hid_over_i2c_en); FT260_SSTAT_ATTR_SHOW(power_saving_en); static DEVICE_ATTR_RO(power_saving_en); FT260_SSTAT_ATTR_SHOW(i2c_enable); FT260_BYTE_ATTR_STORE(i2c_enable, ft260_set_i2c_mode_report, 
FT260_SET_I2C_MODE); static DEVICE_ATTR_RW(i2c_enable); FT260_SSTAT_ATTR_SHOW(uart_mode); FT260_BYTE_ATTR_STORE(uart_mode, ft260_set_uart_mode_report, FT260_SET_UART_MODE); static DEVICE_ATTR_RW(uart_mode); FT260_SSTAT_ATTR_SHOW(clock_ctl); FT260_BYTE_ATTR_STORE(clock_ctl, ft260_set_system_clock_report, FT260_SET_CLOCK); static DEVICE_ATTR_RW(clock_ctl); FT260_I2CST_ATTR_SHOW(clock); FT260_WORD_ATTR_STORE(clock, ft260_set_i2c_speed_report, FT260_SET_I2C_CLOCK_SPEED); static DEVICE_ATTR_RW(clock); static ssize_t i2c_reset_store(struct device *kdev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(kdev); int ret = ft260_i2c_reset(hdev); if (ret) return ret; return count; } static DEVICE_ATTR_WO(i2c_reset); static const struct attribute_group ft260_attr_group = { .attrs = (struct attribute *[]) { &dev_attr_chip_mode.attr, &dev_attr_pwren_status.attr, &dev_attr_suspend_status.attr, &dev_attr_hid_over_i2c_en.attr, &dev_attr_power_saving_en.attr, &dev_attr_i2c_enable.attr, &dev_attr_uart_mode.attr, &dev_attr_clock_ctl.attr, &dev_attr_i2c_reset.attr, &dev_attr_clock.attr, NULL } }; static int ft260_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct ft260_device *dev; struct ft260_get_chip_version_report version; int ret; if (!hid_is_usb(hdev)) return -EINVAL; dev = devm_kzalloc(&hdev->dev, sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "failed to parse HID\n"); return ret; } ret = hid_hw_start(hdev, 0); if (ret) { hid_err(hdev, "failed to start HID HW\n"); return ret; } ret = hid_hw_open(hdev); if (ret) { hid_err(hdev, "failed to open HID HW\n"); goto err_hid_stop; } ret = ft260_hid_feature_report_get(hdev, FT260_CHIP_VERSION, (u8 *)&version, sizeof(version)); if (ret < 0) { hid_err(hdev, "failed to retrieve chip version\n"); goto err_hid_close; } hid_info(hdev, "chip code: %02x%02x %02x%02x\n", version.chip_code[0], version.chip_code[1], 
version.chip_code[2], version.chip_code[3]); ret = ft260_is_interface_enabled(hdev); if (ret <= 0) goto err_hid_close; hid_info(hdev, "USB HID v%x.%02x Device [%s] on %s\n", hdev->version >> 8, hdev->version & 0xff, hdev->name, hdev->phys); hid_set_drvdata(hdev, dev); dev->hdev = hdev; dev->adap.owner = THIS_MODULE; dev->adap.class = I2C_CLASS_HWMON; dev->adap.algo = &ft260_i2c_algo; dev->adap.quirks = &ft260_i2c_quirks; dev->adap.dev.parent = &hdev->dev; snprintf(dev->adap.name, sizeof(dev->adap.name), "FT260 usb-i2c bridge"); mutex_init(&dev->lock); init_completion(&dev->wait); ret = ft260_xfer_status(dev, FT260_I2C_STATUS_BUS_BUSY); if (ret) ft260_i2c_reset(hdev); i2c_set_adapdata(&dev->adap, dev); ret = i2c_add_adapter(&dev->adap); if (ret) { hid_err(hdev, "failed to add i2c adapter\n"); goto err_hid_close; } ret = sysfs_create_group(&hdev->dev.kobj, &ft260_attr_group); if (ret < 0) { hid_err(hdev, "failed to create sysfs attrs\n"); goto err_i2c_free; } return 0; err_i2c_free: i2c_del_adapter(&dev->adap); err_hid_close: hid_hw_close(hdev); err_hid_stop: hid_hw_stop(hdev); return ret; } static void ft260_remove(struct hid_device *hdev) { struct ft260_device *dev = hid_get_drvdata(hdev); if (!dev) return; sysfs_remove_group(&hdev->dev.kobj, &ft260_attr_group); i2c_del_adapter(&dev->adap); hid_hw_close(hdev); hid_hw_stop(hdev); } static int ft260_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct ft260_device *dev = hid_get_drvdata(hdev); struct ft260_i2c_input_report *xfer = (void *)data; if (xfer->report >= FT260_I2C_REPORT_MIN && xfer->report <= FT260_I2C_REPORT_MAX) { ft260_dbg("i2c resp: rep %#02x len %d\n", xfer->report, xfer->length); if ((dev->read_buf == NULL) || (xfer->length > dev->read_len - dev->read_idx)) { hid_err(hdev, "unexpected report %#02x, length %d\n", xfer->report, xfer->length); return -1; } memcpy(&dev->read_buf[dev->read_idx], &xfer->data, xfer->length); dev->read_idx += xfer->length; if 
(dev->read_idx == dev->read_len) complete(&dev->wait); } else { hid_err(hdev, "unhandled report %#02x\n", xfer->report); } return 0; } static struct hid_driver ft260_driver = { .name = "ft260", .id_table = ft260_devices, .probe = ft260_probe, .remove = ft260_remove, .raw_event = ft260_raw_event, }; module_hid_driver(ft260_driver); MODULE_DESCRIPTION("FTDI FT260 USB HID to I2C host bridge"); MODULE_AUTHOR("Michael Zaidman <michael.zaidman@gmail.com>"); MODULE_LICENSE("GPL v2");
51 4 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. 
*/ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. 
*/ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse);
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * folios.  A folio_batch is a container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/types.h>

/* 31 pointers + header align the folio_batch structure to a power of two */
#define PAGEVEC_SIZE	31

struct folio;

/**
 * struct folio_batch - A collection of folios.
 *
 * The folio_batch is used to amortise the cost of retrieving and
 * operating on a set of folios.  The order of folios in the batch may be
 * significant (eg delete_from_page_cache_batch()).  Some users of the
 * folio_batch store "exceptional" entries in it which can be removed
 * by calling folio_batch_remove_exceptionals().
 */
struct folio_batch {
	unsigned char nr;
	unsigned char i;
	bool percpu_pvec_drained;
	struct folio *folios[PAGEVEC_SIZE];
};

/**
 * folio_batch_reinit() - Empty a batch without touching the drained flag
 * @fbatch: The folio batch.
 *
 * Resets the fill count and the read cursor; percpu_pvec_drained is left
 * as it is.
 */
static inline void folio_batch_reinit(struct folio_batch *fbatch)
{
	fbatch->i = 0;
	fbatch->nr = 0;
}

/**
 * folio_batch_init() - Initialise a batch of folios
 * @fbatch: The folio batch.
 *
 * A freshly initialised folio_batch contains zero folios.
 */
static inline void folio_batch_init(struct folio_batch *fbatch)
{
	folio_batch_reinit(fbatch);
	fbatch->percpu_pvec_drained = false;
}

/**
 * folio_batch_count() - Number of entries currently stored
 * @fbatch: The folio batch.
 *
 * Return: The number of entries in the batch.
 */
static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
{
	const unsigned int count = fbatch->nr;

	return count;
}

/**
 * folio_batch_space() - Number of free slots
 * @fbatch: The folio batch.
 *
 * Return: PAGEVEC_SIZE minus the number of entries in the batch.
 */
static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
{
	return PAGEVEC_SIZE - folio_batch_count(fbatch);
}

/**
 * folio_batch_add() - Add a folio to a batch.
 * @fbatch: The folio batch.
 * @folio: The folio to add.
 *
 * The folio is added to the end of the batch.
 * The batch must have previously been initialised using folio_batch_init().
 *
 * Return: The number of slots still available.
 */
static inline unsigned folio_batch_add(struct folio_batch *fbatch,
		struct folio *folio)
{
	const unsigned char slot = fbatch->nr;

	fbatch->folios[slot] = folio;
	fbatch->nr = slot + 1;

	return folio_batch_space(fbatch);
}

/**
 * folio_batch_next - Return the next folio to process.
 * @fbatch: The folio batch being processed.
 *
 * Use this function to implement a queue of folios.
 *
 * Return: The next folio in the queue, or NULL if the queue is empty.
 */
static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
{
	if (fbatch->i != fbatch->nr)
		return fbatch->folios[fbatch->i++];

	return NULL;
}

void __folio_batch_release(struct folio_batch *pvec);

/**
 * folio_batch_release() - Release the batch if it holds anything
 * @fbatch: The folio batch.
 *
 * Calls __folio_batch_release() only when the batch is not empty.
 */
static inline void folio_batch_release(struct folio_batch *fbatch)
{
	if (fbatch->nr == 0)
		return;

	__folio_batch_release(fbatch);
}

void folio_batch_remove_exceptionals(struct folio_batch *fbatch);

#endif /* _LINUX_PAGEVEC_H */
191 81 83 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H #include <trace/events/btrfs.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/kobject.h> #include <linux/lockdep.h> #include <linux/wait.h> #include <linux/rwsem.h> #include "volumes.h" struct btrfs_fs_info; struct btrfs_block_group; /* * Different levels for to flush space when doing space reservations. * * The higher the level, the more methods we try to reclaim space. 
*/
enum btrfs_reserve_flush_enum {
	/* If we are in the transaction, we can't flush anything. */
	BTRFS_RESERVE_NO_FLUSH,

	/*
	 * Flush space by:
	 * - Running delayed inode items
	 * - Allocating a new chunk
	 */
	BTRFS_RESERVE_FLUSH_LIMIT,

	/*
	 * Flush space by:
	 * - Running delayed inode items
	 * - Running delayed refs
	 * - Running delalloc and waiting for ordered extents
	 * - Allocating a new chunk
	 * - Committing transaction
	 */
	BTRFS_RESERVE_FLUSH_EVICT,

	/*
	 * Flush space by above mentioned methods and by:
	 * - Running delayed iputs
	 * - Committing transaction
	 *
	 * Can be interrupted by a fatal signal.
	 *
	 * NOTE(review): this comment sits in front of the next three values
	 * and presumably describes all of them -- confirm.
	 */
	BTRFS_RESERVE_FLUSH_DATA,
	BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
	BTRFS_RESERVE_FLUSH_ALL,

	/*
	 * Pretty much the same as FLUSH_ALL, but can also steal space from
	 * global rsv.
	 *
	 * Can be interrupted by a fatal signal.
	 */
	BTRFS_RESERVE_FLUSH_ALL_STEAL,

	/*
	 * This is for btrfs_use_block_rsv only.  We have exhausted our block
	 * rsv and our global block rsv.  This can happen for things like
	 * delalloc where we are overwriting a lot of extents with a single
	 * extent and didn't reserve enough space.  Alternatively it can happen
	 * with delalloc where we reserve one extent's worth for a large extent
	 * but fragmentation leads to multiple extents being created.  This will
	 * give us the reservation in the case of
	 *
	 * if (num_bytes < (space_info->total_bytes -
	 *		    btrfs_space_info_used(space_info, false))
	 *
	 * Which ignores bytes_may_use.  This is potentially dangerous, but our
	 * reservation system is generally pessimistic so is able to absorb this
	 * style of mistake.
	 */
	BTRFS_RESERVE_FLUSH_EMERGENCY,
};

/*
 * Please be aware that the order of enum values will be the order of the reclaim
 * process in btrfs_async_reclaim_metadata_space().
*/
enum btrfs_flush_state {
	FLUSH_DELAYED_ITEMS_NR	= 1,
	FLUSH_DELAYED_ITEMS	= 2,
	FLUSH_DELAYED_REFS_NR	= 3,
	FLUSH_DELAYED_REFS	= 4,
	FLUSH_DELALLOC		= 5,
	FLUSH_DELALLOC_WAIT	= 6,
	FLUSH_DELALLOC_FULL	= 7,
	ALLOC_CHUNK		= 8,
	ALLOC_CHUNK_FORCE	= 9,
	RUN_DELAYED_IPUTS	= 10,
	COMMIT_TRANS		= 11,
	RESET_ZONES		= 12,
};

/* Identifiers for the sub-groups a space_info can be split into. */
enum btrfs_space_info_sub_group {
	BTRFS_SUB_GROUP_PRIMARY,
	BTRFS_SUB_GROUP_DATA_RELOC,
	BTRFS_SUB_GROUP_TREELOG,
};

/* Number of slots in btrfs_space_info::sub_group[] */
#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1

/*
 * Space accounting state for one type of block group: byte counters, the
 * lists of pending reservation tickets, and reclaim tuning and statistics.
 */
struct btrfs_space_info {
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *parent;
	struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX];
	int subgroup_id;
	spinlock_t lock;

	u64 total_bytes;	/* total bytes in the space,
				   this doesn't take mirrors into account */
	u64 bytes_used;		/* total bytes used,
				   this doesn't take mirrors into account */
	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
				   transaction finishes */
	u64 bytes_reserved;	/* total bytes the allocator has reserved for
				   current allocations */
	u64 bytes_may_use;	/* number of bytes that may be used for
				   delalloc/allocations */
	u64 bytes_readonly;	/* total bytes that are read only */
	u64 bytes_zone_unusable;	/* total bytes that are unusable until
					   resetting the device zone */

	u64 max_extent_size;	/* This will hold the maximum extent size of
				   the space info if we had an ENOSPC in the
				   allocator. */
	/* Chunk size in bytes */
	u64 chunk_size;

	/*
	 * Once a block group drops below this threshold (percents) we'll
	 * schedule it for reclaim.
	 */
	int bg_reclaim_threshold;

	int clamp;		/* Used to scale our threshold for preemptive
				   flushing. The value is >> clamp, so turns
				   out to be a 2^clamp divisor. */

	unsigned int full:1;	/* indicates that we cannot allocate any more
				   chunks for this space */
	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */

	unsigned int flush:1;		/* set if we are trying to make space */

	unsigned int force_alloc;	/* set if we need to force a chunk
					   alloc for this space */

	u64 disk_used;		/* total bytes used on disk */
	u64 disk_total;		/* total bytes on disk, takes mirrors into
				   account */

	u64 flags;

	struct list_head list;
	/* Protected by the spinlock 'lock'. */
	struct list_head ro_bgs;
	struct list_head priority_tickets;
	struct list_head tickets;

	/*
	 * Size of space that needs to be reclaimed in order to satisfy pending
	 * tickets
	 */
	u64 reclaim_size;

	/*
	 * tickets_id just indicates the next ticket will be handled, so note
	 * it's not stored per ticket.
	 */
	u64 tickets_id;

	struct rw_semaphore groups_sem;
	/* for block groups in our same type */
	struct list_head block_groups[BTRFS_NR_RAID_TYPES];

	struct kobject kobj;
	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];

	/*
	 * Monotonically increasing counter of block group reclaim attempts
	 * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_count
	 */
	u64 reclaim_count;

	/*
	 * Monotonically increasing counter of reclaimed bytes
	 * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_bytes
	 */
	u64 reclaim_bytes;

	/*
	 * Monotonically increasing counter of reclaim errors
	 * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors
	 */
	u64 reclaim_errors;

	/*
	 * If true, use the dynamic relocation threshold, instead of the
	 * fixed bg_reclaim_threshold.
	 */
	bool dynamic_reclaim;

	/*
	 * Periodically check all block groups against the reclaim
	 * threshold in the cleaner thread.
	 */
	bool periodic_reclaim;

	/*
	 * Periodic reclaim should be a no-op if a space_info hasn't
	 * freed any space since the last time we tried.
	 */
	bool periodic_reclaim_ready;

	/*
	 * Net bytes freed or allocated since the last reclaim pass.
	 */
	s64 reclaimable_bytes;
};

/* One pending space reservation; linked through 'list', waited on via 'wait'. */
struct reserve_ticket {
	u64 bytes;
	int error;
	bool steal;
	struct list_head list;
	wait_queue_head_t wait;
};

/*
 * Return true when the space_info flags contain both
 * BTRFS_BLOCK_GROUP_METADATA and BTRFS_BLOCK_GROUP_DATA.
 */
static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info)
{
	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}

/*
 * Declare a helper function to detect underflow of various space info members.
 *
 * DECLARE_SPACE_INFO_UPDATE(name, trace_name) defines
 * btrfs_space_info_update_<name>(sinfo, bytes), which adds the signed delta
 * @bytes to sinfo-><name>. The caller must hold sinfo->lock. Both
 * trace_update_<name>() and trace_btrfs_space_reservation() are emitted
 * before the counter changes. If a negative delta is larger than the current
 * value, the helper warns and sets the counter to 0 instead of wrapping.
 */
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name)			\
static inline void							\
btrfs_space_info_update_##name(struct btrfs_space_info *sinfo,		\
			       s64 bytes)				\
{									\
	struct btrfs_fs_info *fs_info = sinfo->fs_info;			\
	const u64 abs_bytes = (bytes < 0) ? -bytes : bytes;		\
	lockdep_assert_held(&sinfo->lock);				\
	trace_update_##name(fs_info, sinfo, sinfo->name, bytes);	\
	trace_btrfs_space_reservation(fs_info, trace_name,		\
				      sinfo->flags, abs_bytes,		\
				      bytes > 0);			\
	if (bytes < 0 && sinfo->name < -bytes) {			\
		WARN_ON(1);						\
		sinfo->name = 0;					\
		return;							\
	}								\
	sinfo->name += bytes;						\
}

DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable");

int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
				struct btrfs_block_group *block_group);
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
					u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags);
u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
			  bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   bool dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 const struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush);

/*
 * Release @num_bytes from space_info->bytes_may_use and then call
 * btrfs_try_granting_tickets(), all under space_info->lock.
 */
static inline void btrfs_space_info_free_bytes_may_use(
				struct btrfs_space_info *space_info,
				u64 num_bytes)
{
	spin_lock(&space_info->lock);
	/* The update helper takes a signed delta, hence the negation */
	btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
	btrfs_try_granting_tickets(space_info->fs_info, space_info);
	spin_unlock(&space_info->lock);
}
int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);

void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);

#endif /* BTRFS_SPACE_INFO_H */
3 3 3 2 1 3 3 7 7 7 7 7 1 1 1 1 1 1 49 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 // SPDX-License-Identifier: GPL-2.0-only /* * VHT handling * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "rate.h" static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap, u32 flag) { __le32 le_flag = cpu_to_le32(flag); if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag && !(sdata->u.mgd.vht_capa.vht_cap_info & le_flag)) vht_cap->cap &= ~flag; } void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap) { int i; u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n; if (!vht_cap->vht_supported) return; if (sdata->vif.type != 
NL80211_IFTYPE_STATION) return; __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RXLDPC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_80); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_160); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TXSTBC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN); /* Allow user to decrease AMPDU length exponent */ if (sdata->u.mgd.vht_capa_mask.vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) { u32 cap, n; n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; if (n < cap) { vht_cap->cap &= ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; vht_cap->cap |= n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; } } /* Allow the user to decrease MCSes */ rxmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map); rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map); rxmcs_n &= rxmcs_mask; rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); txmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map); txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map); txmcs_n &= txmcs_mask; txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); for (i = 0; i < 8; i++) { u8 m, n, c; m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || 
n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { rxmcs_cap &= ~(3 << 2*i); rxmcs_cap |= (rxmcs_n & (3 << 2*i)); } m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { txmcs_cap &= ~(3 << 2*i); txmcs_cap |= (txmcs_n & (3 << 2*i)); } } vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap); vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap); } void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; u32 mpdu_len; memset(vht_cap, 0, sizeof(*vht_cap)); if (!link_sta->pub->ht_cap.ht_supported) return; if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) return; /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 * and others based on the BCM4360 chipset) will unset this * capability bit when operating in 20 MHz. */ vht_cap->vht_supported = true; own_cap = sband->vht_cap; /* * If user has specified capability overrides, take care * of that if the station we're setting up is the AP that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. 
*/ if (sdata->vif.type == NL80211_IFTYPE_STATION && !test_sta_flag(link_sta->sta, WLAN_STA_TDLS_PEER)) ieee80211_apply_vhtcap_overrides(sdata, &own_cap); /* take some capabilities as-is */ cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); vht_cap->cap = cap_info; vht_cap->cap &= IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_VHT_TXOP_PS | IEEE80211_VHT_CAP_HTC_VHT | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; vht_cap->cap |= min_t(u32, cap_info & IEEE80211_VHT_CAP_MAX_MPDU_MASK, own_cap.cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK); /* and some based on our own capabilities */ switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; break; default: /* nothing */ break; } /* symmetric capabilities */ vht_cap->cap |= cap_info & own_cap.cap & (IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160); /* remaining ones */ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; if (own_cap.cap & 
IEEE80211_VHT_CAP_RXSTBC_MASK) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. */ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); /* copy EXT_NSS_BW Support value or remove the capability */ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW)) vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); else vht_cap->vht_mcs.tx_highest &= ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); /* but also restrict MCSes */ for (i = 0; i < 8; i++) { u16 own_rx, own_tx, peer_rx, peer_tx; own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_rx < peer_tx) peer_tx = own_rx; } if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_tx < peer_rx) peer_rx = own_tx; } vht_cap->vht_mcs.rx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); vht_cap->vht_mcs.tx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); } /* * This is a workaround for VHT-enabled STAs which break the spec * and have the VHT-MCS Rx map filled in with value 3 for all eight * spatial streams, an example is AR9462. 
* * As per spec, in section 22.1.1 Introduction to the VHT PHY * A VHT STA shall support at least single spatial stream VHT-MCSs * 0 to 7 (transmit and receive) in all supported channel widths. */ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { vht_cap->vht_supported = false; sdata_info(sdata, "Ignoring VHT IE from %pM (link:%pM) due to invalid rx_mcs_map\n", link_sta->sta->addr, link_sta->addr); return; } /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; default: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; if (!(vht_cap->vht_mcs.tx_highest & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) break; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); /* * Work around the Cisco 9115 FW 17.3 bug by taking the min of * both reported MPDU lengths. */ mpdu_len = vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK; if (vht_cap_ie2) mpdu_len = min_t(u32, mpdu_len, le32_get_bits(vht_cap_ie2->vht_cap_info, IEEE80211_VHT_CAP_MAX_MPDU_MASK)); /* * FIXME - should the amsdu len be per link? store per link * and maintain a minimum? 
*/ switch (mpdu_len) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: default: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; break; } ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } /* FIXME: move this to some better location - parses HE/EHT now */ static enum ieee80211_sta_rx_bandwidth __ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { unsigned int link_id = link_sta->link_id; struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap; u32 cap_width; if (he_cap->has_he) { enum nl80211_band band; u8 info; if (chandef) { band = chandef->chan->band; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); band = link_conf->chanreq.oper.chan->band; rcu_read_unlock(); } if (eht_cap->has_eht && band == NL80211_BAND_6GHZ) { info = eht_cap->eht_cap_elem.phy_cap_info[0]; if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) return IEEE80211_STA_RX_BW_320; } info = he_cap->he_cap_elem.phy_cap_info[0]; if (band == NL80211_BAND_2GHZ) { if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) return IEEE80211_STA_RX_BW_40; return IEEE80211_STA_RX_BW_20; } if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G || info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return IEEE80211_STA_RX_BW_160; if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) return IEEE80211_STA_RX_BW_80; return IEEE80211_STA_RX_BW_20; } if (!vht_cap->vht_supported) return link_sta->pub->ht_cap.cap & 
IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ || cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return IEEE80211_STA_RX_BW_160; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) return IEEE80211_STA_RX_BW_160; return IEEE80211_STA_RX_BW_80; } enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { /* * With RX OMI, also pretend that the STA's capability changed. * Of course this isn't really true, it didn't change, only our * RX capability was changed by notifying RX OMI to the STA. * The purpose, however, is to save power, and that requires * changing also transmissions to the AP and the chanctx. The * transmissions depend on link_sta->bandwidth which is set in * _ieee80211_sta_cur_vht_bw() below, but the chanctx depends * on the result of this function which is also called by * _ieee80211_sta_cur_vht_bw(), so we need to do that here as * well. This is sufficient for the steady state, but during * the transition we already need to change TX/RX separately, * so _ieee80211_sta_cur_vht_bw() below applies the _tx one. */ return min(__ieee80211_sta_cap_rx_bw(link_sta, chandef), link_sta->rx_omi_bw_rx); } enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; if (!vht_cap->vht_supported) { if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; } cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return NL80211_CHAN_WIDTH_80P80; return NL80211_CHAN_WIDTH_80; } enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta) { enum ieee80211_sta_rx_bandwidth cur_bw = link_sta->pub->bandwidth; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; switch (cur_bw) { case IEEE80211_STA_RX_BW_20: if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; else return NL80211_CHAN_WIDTH_20; case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; return NL80211_CHAN_WIDTH_80P80; default: return NL80211_CHAN_WIDTH_20; } } /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { struct sta_info *sta = link_sta->sta; enum nl80211_chan_width bss_width; enum ieee80211_sta_rx_bandwidth bw; if (chandef) { bss_width = chandef->width; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sta->sdata->vif.link_conf[link_sta->link_id]); if (WARN_ON_ONCE(!link_conf)) { rcu_read_unlock(); return IEEE80211_STA_RX_BW_20; } bss_width = link_conf->chanreq.oper.width; rcu_read_unlock(); } /* intentionally do not take rx_bw_omi_rx into account */ bw = __ieee80211_sta_cap_rx_bw(link_sta, chandef); bw = min(bw, link_sta->cur_max_bandwidth); /* but do apply rx_omi_bw_tx */ bw = min(bw, link_sta->rx_omi_bw_tx); /* Don't consider AP's 
bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation * possible on the TDLS link if the peers have wider bandwidth * capability. * * However, in this case, and only if the TDLS peer is authorized, * limit to the tdls_chandef so that the configuration here isn't * wider than what's actually requested on the channel context. */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) && test_sta_flag(sta, WLAN_STA_AUTHORIZED) && sta->tdls_chandef.chan) bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width)); else bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } void ieee80211_sta_init_nss(struct link_sta_info *link_sta) { u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, eht_rx_nss = 0, rx_nss; bool support_160; if (link_sta->pub->eht_cap.has_eht) { int i; const u8 *rx_nss_mcs = (void *)&link_sta->pub->eht_cap.eht_mcs_nss_supp; /* get the max nss for EHT over all possible bandwidths and mcs */ for (i = 0; i < sizeof(struct ieee80211_eht_mcs_nss_supp); i++) eht_rx_nss = max_t(u8, eht_rx_nss, u8_get_bits(rx_nss_mcs[i], IEEE80211_EHT_MCS_NSS_RX)); } if (link_sta->pub->he_cap.has_he) { int i; u8 rx_mcs_80 = 0, rx_mcs_160 = 0; const struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; u16 mcs_160_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3; if (mcs_160 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_160 = i + 1; break; } } for (i = 7; i >= 0; i--) { u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3; if (mcs_80 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_80 = i + 1; break; } } support_160 = he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (support_160) he_rx_nss = min(rx_mcs_80, rx_mcs_160); else he_rx_nss = rx_mcs_80; } if (link_sta->pub->ht_cap.ht_supported) { if 
(link_sta->pub->ht_cap.mcs.rx_mask[0]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[1]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[2]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[3]) ht_rx_nss++; /* FIXME: consider rx_highest? */ } if (link_sta->pub->vht_cap.vht_supported) { int i; u16 rx_mcs_map; rx_mcs_map = le16_to_cpu(link_sta->pub->vht_cap.vht_mcs.rx_mcs_map); for (i = 7; i >= 0; i--) { u8 mcs = (rx_mcs_map >> (2 * i)) & 3; if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) { vht_rx_nss = i + 1; break; } } /* FIXME: consider rx_highest? */ } rx_nss = max(vht_rx_nss, ht_rx_nss); rx_nss = max(he_rx_nss, rx_nss); rx_nss = max(eht_rx_nss, rx_nss); rx_nss = max_t(u8, 1, rx_nss); link_sta->capa_nss = rx_nss; /* that shouldn't be set yet, but we can handle it anyway */ if (link_sta->op_mode_nss) link_sta->pub->rx_nss = min_t(u8, rx_nss, link_sta->op_mode_nss); else link_sta->pub->rx_nss = rx_nss; } u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; struct sta_opmode_info sta_opmode = {}; u32 changed = 0; u8 nss; /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; if (link_sta->op_mode_nss != nss) { if (nss <= link_sta->capa_nss) { link_sta->op_mode_nss = nss; if (nss != link_sta->pub->rx_nss) { link_sta->pub->rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; sta_opmode.rx_nss = link_sta->pub->rx_nss; sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } } else { sdata_dbg(sdata, "Ignore NSS change to invalid %d in VHT opmode notif from %pM", nss, link_sta->pub->addr); } } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = 
IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; else link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: /* legacy only, no longer used by newer spec */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } new_bw = ieee80211_sta_cur_vht_bw(link_sta); if (new_bw != link_sta->pub->bandwidth) { link_sta->pub->bandwidth = new_bw; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(link_sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } if (sta_opmode.changed) cfg80211_sta_opmode_change_notify(sdata->dev, link_sta->addr, &sta_opmode, GFP_KERNEL); return changed; } void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt) { struct ieee80211_bss_conf *link_conf = link->conf; if (!link_conf->mu_mimo_owner) return; if (!memcmp(mgmt->u.action.u.vht_group_notif.position, link_conf->mu_group.position, WLAN_USER_POSITION_LEN) && !memcmp(mgmt->u.action.u.vht_group_notif.membership, link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(link_conf->mu_group.membership, mgmt->u.action.u.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, mgmt->u.action.u.vht_group_notif.position, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_MU_GROUPS); } void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position) { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(vif->link_conf[link_id]); if (!WARN_ON_ONCE(!link_conf || 
!link_conf->mu_mimo_owner)) { memcpy(link_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; u32 changed = __ieee80211_vht_handle_opmode(sdata, link_sta, opmode, band); if (changed > 0) { ieee80211_recalc_min_chandef(sdata, link_sta->link_id); rate_control_rate_update(local, sband, link_sta, changed); } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]) { int i; u16 mask, cap = le16_to_cpu(vht_cap); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; switch (mask) { case IEEE80211_VHT_MCS_SUPPORT_0_7: vht_mask[i] = 0x00FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: vht_mask[i] = 0x01FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: vht_mask[i] = 0x03FF; break; case IEEE80211_VHT_MCS_NOT_SUPPORTED: default: vht_mask[i] = 0; break; } } }
34 27 26 10 10 62 63 112 113 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * registration of device and proc * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/initval.h> #include "seq_oss_device.h" #include "seq_oss_synth.h" /* * module option */ MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("OSS-compatible sequencer module"); MODULE_LICENSE("GPL"); /* Takashi says this is really only for sound-service-0-, but this is OK. 
*/
MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_SEQUENCER);
MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MUSIC);

/*
 * prototypes
 */
static int register_device(void);
static void unregister_device(void);
#ifdef CONFIG_SND_PROC_FS
static int register_proc(void);
static void unregister_proc(void);
#else
/* Without CONFIG_SND_PROC_FS the proc hooks become no-ops that report success. */
static inline int register_proc(void) { return 0; }
static inline void unregister_proc(void) {}
#endif

static int odev_open(struct inode *inode, struct file *file);
static int odev_release(struct inode *inode, struct file *file);
static ssize_t odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset);
static ssize_t odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset);
static long odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
static __poll_t odev_poll(struct file *file, poll_table * wait);

/*
 * module interface
 */

/* Sequencer driver descriptor registered in alsa_seq_oss_init(). */
static struct snd_seq_driver seq_oss_synth_driver = {
	.driver = {
		.name = KBUILD_MODNAME,
		.probe = snd_seq_oss_synth_probe,
		.remove = snd_seq_oss_synth_remove,
	},
	.id = SNDRV_SEQ_DEV_ID_OSS,
	.argsize = sizeof(struct snd_seq_oss_reg),
};

/*
 * Module entry point.
 *
 * Brings the pieces up in this order: OSS minor devices, proc entry,
 * sequencer client, sequencer driver.  When a step fails, every step that
 * already succeeded is undone in reverse order and the negative error code
 * of the failing step is returned.  On success the return value is the
 * (non-negative) result of snd_seq_driver_register().
 */
static int __init alsa_seq_oss_init(void)
{
	int rc;

	rc = register_device();
	if (rc < 0)
		goto error;
	rc = register_proc();
	if (rc < 0) {
		unregister_device();
		goto error;
	}
	rc = snd_seq_oss_create_client();
	if (rc < 0) {
		unregister_proc();
		unregister_device();
		goto error;
	}

	rc = snd_seq_driver_register(&seq_oss_synth_driver);
	if (rc < 0) {
		snd_seq_oss_delete_client();
		unregister_proc();
		unregister_device();
		goto error;
	}

	/* success */
	snd_seq_oss_synth_init();

 error:
	/* Shared exit: reached on both the success and the failure paths. */
	return rc;
}

/*
 * Module exit point: tears everything down in the reverse order of
 * alsa_seq_oss_init().
 */
static void __exit alsa_seq_oss_exit(void)
{
	snd_seq_driver_unregister(&seq_oss_synth_driver);
	snd_seq_oss_delete_client();
	unregister_proc();
	unregister_device();
}

module_init(alsa_seq_oss_init)
module_exit(alsa_seq_oss_exit)

/*
 * ALSA minor device interface
 */

/*
 * Taken around open, release, most ioctls, device (un)registration and the
 * proc read handler in this file.
 */
static DEFINE_MUTEX(register_mutex);

/*
 * open() handler.  The minor number selects the emulation level: the OSS
 * music minor opens in MUSIC mode, anything else in SYNTH mode.  Returns
 * whatever snd_seq_oss_open() returns.
 */
static int
odev_open(struct inode *inode, struct file *file)
{
	int level, rc;

	if (iminor(inode) == SNDRV_MINOR_OSS_MUSIC)
		level = SNDRV_SEQ_OSS_MODE_MUSIC;
	else
		level = SNDRV_SEQ_OSS_MODE_SYNTH;

	mutex_lock(&register_mutex);
	rc = snd_seq_oss_open(file, level);
	mutex_unlock(&register_mutex);

	return rc;
}

/*
 * release() handler.  A file without private data is treated as already
 * released; always returns 0.
 */
static int
odev_release(struct inode *inode, struct file *file)
{
	struct seq_oss_devinfo *dp;

	dp = file->private_data;
	if (!dp)
		return 0;

	mutex_lock(&register_mutex);
	snd_seq_oss_release(dp);
	mutex_unlock(&register_mutex);

	return 0;
}

/*
 * read() handler: forwards to snd_seq_oss_read().  Returns -ENXIO when the
 * file carries no device info.  The offset argument is not used.
 */
static ssize_t
odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
{
	struct seq_oss_devinfo *dp;
	dp = file->private_data;
	if (snd_BUG_ON(!dp))
		return -ENXIO;
	return snd_seq_oss_read(dp, buf, count);
}


/*
 * write() handler: forwards to snd_seq_oss_write().  Returns -ENXIO when
 * the file carries no device info.  The offset argument is not used.
 */
static ssize_t
odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset)
{
	struct seq_oss_devinfo *dp;
	dp = file->private_data;
	if (snd_BUG_ON(!dp))
		return -ENXIO;
	return snd_seq_oss_write(dp, buf, count, file);
}

/*
 * ioctl() handler.
 *
 * Every command except SNDCTL_SEQ_SYNC runs under register_mutex; the lock
 * is taken interruptibly and -ERESTARTSYS is returned if that is
 * interrupted.  SNDCTL_SEQ_SYNC is dispatched without the mutex.
 * Returns -ENXIO when the file carries no device info, otherwise the result
 * of snd_seq_oss_ioctl().
 */
static long
odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct seq_oss_devinfo *dp;
	long rc;

	dp = file->private_data;
	if (snd_BUG_ON(!dp))
		return -ENXIO;

	if (cmd != SNDCTL_SEQ_SYNC &&
	    mutex_lock_interruptible(&register_mutex))
		return -ERESTARTSYS;
	rc = snd_seq_oss_ioctl(dp, cmd, arg);
	/* Unlock under the same condition the lock was taken above. */
	if (cmd != SNDCTL_SEQ_SYNC)
		mutex_unlock(&register_mutex);
	return rc;
}

#ifdef CONFIG_COMPAT
/* Compat ioctl: converts the argument with compat_ptr() and reuses odev_ioctl(). */
static long odev_ioctl_compat(struct file *file, unsigned int cmd,
			      unsigned long arg)
{
	return odev_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#else
#define odev_ioctl_compat NULL
#endif

/*
 * poll() handler: forwards to snd_seq_oss_poll().  Reports EPOLLERR when
 * the file carries no device info.
 */
static __poll_t
odev_poll(struct file *file, poll_table * wait)
{
	struct seq_oss_devinfo *dp;
	dp = file->private_data;
	if (snd_BUG_ON(!dp))
		return EPOLLERR;
	return snd_seq_oss_poll(dp, file, wait);
}

/*
 * registration of sequencer minor device
 */

/* File operations shared by both OSS minors registered below. */
static const struct file_operations seq_oss_f_ops =
{
	.owner = THIS_MODULE,
	.read = odev_read,
	.write = odev_write,
	.open = odev_open,
	.release = odev_release,
	.poll = odev_poll,
	.unlocked_ioctl = odev_ioctl,
	.compat_ioctl = odev_ioctl_compat,
	.llseek = noop_llseek,
};

/*
 * Register the OSS sequencer and music devices, both backed by
 * seq_oss_f_ops, under register_mutex.
 *
 * Returns 0 on success.  On failure an error is logged and the negative
 * code from snd_register_oss_device() is returned; if the music device
 * fails, the already registered sequencer device is unregistered first.
 */
static int __init
register_device(void)
{
	int rc;

	mutex_lock(&register_mutex);
	rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER,
				     NULL, 0,
				     &seq_oss_f_ops, NULL);
	if (rc < 0) {
		pr_err("ALSA: seq_oss: can't register device seq\n");
		mutex_unlock(&register_mutex);
		return rc;
	}
	rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC,
				     NULL, 0,
				     &seq_oss_f_ops, NULL);
	if (rc < 0) {
		pr_err("ALSA: seq_oss: can't register device music\n");
		/* Roll back the sequencer device registered just above. */
		snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0);
		mutex_unlock(&register_mutex);
		return rc;
	}
	mutex_unlock(&register_mutex);
	return 0;
}

/*
 * Unregister both OSS devices under register_mutex.  A failure is only
 * logged; the second device is still unregistered.
 */
static void
unregister_device(void)
{
	mutex_lock(&register_mutex);
	if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC, NULL, 0) < 0)
		pr_err("ALSA: seq_oss: error unregister device music\n");
	if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0) < 0)
		pr_err("ALSA: seq_oss: error unregister device seq\n");
	mutex_unlock(&register_mutex);
}

/*
 * /proc interface
 */

#ifdef CONFIG_SND_PROC_FS

/* Proc entry created by register_proc(); NULL when none is registered. */
static struct snd_info_entry *info_entry;

/*
 * Text read callback for the proc entry: prints the emulation version, then
 * the system, synth and MIDI info sections, all under register_mutex.
 */
static void
info_read(struct snd_info_entry *entry, struct snd_info_buffer *buf)
{
	mutex_lock(&register_mutex);
	snd_iprintf(buf, "OSS sequencer emulation version %s\n", SNDRV_SEQ_OSS_VERSION_STR);
	snd_seq_oss_system_info_read(buf);
	snd_seq_oss_synth_info_read(buf);
	snd_seq_oss_midi_info_read(buf);
	mutex_unlock(&register_mutex);
}


/*
 * Create and register the text proc entry under snd_seq_root.
 *
 * Returns 0 on success, or -ENOMEM if the entry cannot be created or
 * registered (a failed registration frees the entry again).
 */
static int __init
register_proc(void)
{
	struct snd_info_entry *entry;

	entry = snd_info_create_module_entry(THIS_MODULE, SNDRV_SEQ_OSS_PROCNAME, snd_seq_root);
	if (entry == NULL)
		return -ENOMEM;

	entry->content = SNDRV_INFO_CONTENT_TEXT;
	entry->private_data = NULL;
	entry->c.text.read = info_read;
	if (snd_info_register(entry) < 0) {
		snd_info_free_entry(entry);
		return -ENOMEM;
	}
	info_entry = entry;
	return 0;
}

/* Free the proc entry and clear the cached pointer. */
static void
unregister_proc(void)
{
	snd_info_free_entry(info_entry);
	info_entry = NULL;
}
#endif /* CONFIG_SND_PROC_FS */
393 391 3 395 2 394 28 28 306 302 4 301 5 305 296 8 8 306 2 302 290 16 4 301 304 2 298 8 8 76 75 75 72 4 76 69 7 7 76 76 1 75 65 11 3 73 75 1 69 7 7 73 73 73 71 71 71 2 2 2 2 2 264 1 262 85 85 83 3 84 1 12 85 10 10 10 3 10 10 10 11 11 11 9 1 19 41 20 21 21 12 8 17 12 5 3 27 27 11 11 11 28 28 10 10 9 3 3 21 21 21 99 411 2 408 4 1 395 424 425 396 187 277 276 427 203 424 397 263 262 261 92 28 90 2 39 90 92 1 91 76 76 75 72 72 1 71 72 72 12 5 7 11 4 7 78 12 74 7 7 7 78 78 78 78 77 77 77 77 77 77 77 1 73 4 4 73 21 77 77 13 21 21 54 22 76 75 61 72 75 1 11 74 7 7 7 335 335 334 5 5 5 10 8 8 1 5 2 1 5 2 2 2 6 6 1 1 8 8 5 1 3 8 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 
371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 
871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 
1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 
1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 
2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 
2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 
2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 
3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 
3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * C. 
Robin <chris@hundredacre.ac.uk>
 * Jon Grimm <jgrimm@us.ibm.com>
 * Xingang Guo <xingang.guo@intel.com>
 * Dajiang Zhang <dajiang.zhang@nokia.com>
 * Sridhar Samudrala <sri@us.ibm.com>
 * Daisy Chang <daisyc@us.ibm.com>
 * Ardelle Fan <ardelle.fan@intel.com>
 * Kevin Gao <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <net/sock.h>

#include <linux/skbuff.h>
#include <linux/random.h>	/* for get_random_bytes */
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations of file-local helpers used before their definition. */
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
					    __u8 type, __u8 flags, int paylen,
					    gfp_t gfp);
static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
					 __u8 flags, int paylen, gfp_t gfp);
static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
					   __u8 type, __u8 flags, int paylen,
					   gfp_t gfp);
static struct sctp_cookie_param *sctp_pack_cookie(
					const struct sctp_endpoint *ep,
					const struct sctp_association *asoc,
					const struct sctp_chunk *init_chunk,
					int *cookie_len,
					const __u8 *raw_addrs, int addrs_len);
static int sctp_process_param(struct sctp_association *asoc,
			      union sctp_params param,
			      const union sctp_addr *peer_addr,
			      gfp_t gfp);
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
			      const void *data);

/* Control chunk destructor.
 *
 * Installed as skb->destructor by sctp_control_set_owner_w().  If the chunk
 * holds a shared key, the reference taken there is dropped here; before
 * dropping it, userland may be notified that a deactivated key is now free.
 */
static void sctp_control_release_owner(struct sk_buff *skb)
{
	struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;

	if (chunk->shkey) {
		struct sctp_shared_key *shkey = chunk->shkey;
		struct sctp_association *asoc = chunk->asoc;

		/* refcnt == 2 and !list_empty mean after this release, it's
		 * not being used anywhere, and it's time to notify userland
		 * that this shkey can be freed if it's been deactivated.
		 */
		if (shkey->deactivated && !list_empty(&shkey->key_list) &&
		    refcount_read(&shkey->refcnt) == 2) {
			struct sctp_ulpevent *ev;

			ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id,
							SCTP_AUTH_FREE_KEY,
							GFP_KERNEL);
			/* A failed event allocation is silently skipped. */
			if (ev)
				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
		}
		sctp_auth_shkey_release(chunk->shkey);
	}
}

/* Attach ownership information to a control chunk's skb.
 *
 * Sets skb->sk from the association (NULL when there is none), records the
 * chunk as the destructor argument and installs
 * sctp_control_release_owner() as the skb destructor.  For chunks with
 * chunk->auth set, a reference on the association's shared key is taken and
 * stored in chunk->shkey.
 */
static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
{
	struct sctp_association *asoc = chunk->asoc;
	struct sk_buff *skb = chunk->skb;

	/* TODO: properly account for control chunks.
	 * To do it right we'll need:
	 *  1) endpoint if association isn't known.
	 *  2) proper memory accounting.
	 *
	 *  For now don't do anything.
	 */
	/* NOTE(review): asoc is dereferenced here but NULL-checked in the
	 * statement right after this block; presumably chunk->auth is only
	 * set when an association exists -- confirm against callers.
	 */
	if (chunk->auth) {
		chunk->shkey = asoc->shkey;
		sctp_auth_shkey_hold(chunk->shkey);
	}
	skb->sk = asoc ? asoc->base.sk : NULL;
	skb_shinfo(skb)->destructor_arg = chunk;
	skb->destructor = sctp_control_release_owner;
}

/* RFC 2960 3.3.2 Initiation (INIT) (1)
 *
 * Note 2: The ECN capable field is reserved for future use of
 * Explicit Congestion Notification.
 */
/* Both parameters below consist of a bare parameter header (type plus a
 * length equal to the header size) with no value bytes.
 */
static const struct sctp_paramhdr ecap_param = {
	SCTP_PARAM_ECN_CAPABLE,
	cpu_to_be16(sizeof(struct sctp_paramhdr)),
};
static const struct sctp_paramhdr prsctp_param = {
	SCTP_PARAM_FWD_TSN_SUPPORT,
	cpu_to_be16(sizeof(struct sctp_paramhdr)),
};

/* A helper to initialize an op error inside a provided chunk, as most
 * cause codes will be embedded inside an abort chunk.
 *
 * @chunk:      chunk to append the error cause header to
 * @cause_code: cause code, already in network byte order
 * @paylen:     number of payload bytes that will follow the header
 *
 * Only the cause header is appended here; its length field already counts
 * @paylen, and the tailroom check covers header plus payload so that the
 * caller can append the payload afterwards.
 *
 * Returns 0 on success, or -ENOSPC if the skb tailroom is smaller than
 * header plus @paylen (nothing is written in that case).
 */
int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
		    size_t paylen)
{
	struct sctp_errhdr err;
	__u16 len;

	/* Cause code constants are now defined in network order.  */
	err.cause = cause_code;
	len = sizeof(err) + paylen;
	err.length = htons(len);

	if (skb_tailroom(chunk->skb) < len)
		return -ENOSPC;

	chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);

	return 0;
}

/* 3.3.2 Initiation (INIT) (1)
 *
 * This chunk is used to initiate a SCTP association between two
 * endpoints. The format of the INIT chunk is shown below:
 *
 *     0                   1                   2                   3
 *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |   Type = 1    |  Chunk Flags  |      Chunk Length             |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                         Initiate Tag                          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |           Advertised Receiver Window Credit (a_rwnd)          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |  Number of Outbound Streams   |  Number of Inbound Streams    |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                          Initial TSN                          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    \                                                               \
 *    /              Optional/Variable-Length Parameters              /
 *    \                                                               \
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *
 * The INIT chunk contains the following parameters. Unless otherwise
 * noted, each parameter MUST only be included once in the INIT chunk.
 *
 * Fixed Parameters                     Status
 * ----------------------------------------------
 * Initiate Tag                        Mandatory
 * Advertised Receiver Window Credit   Mandatory
 * Number of Outbound Streams          Mandatory
 * Number of Inbound Streams           Mandatory
 * Initial TSN                         Mandatory
 *
 * Variable Parameters                  Status     Type Value
 * -------------------------------------------------------------
 * IPv4 Address (Note 1)               Optional    5
 * IPv6 Address (Note 1)               Optional    6
 * Cookie Preservative                 Optional    9
 * Reserved for ECN Capable (Note 2)   Optional    32768 (0x8000)
 * Host Name Address (Note 3)          Optional    11
 * Supported Address Types (Note 4)    Optional    12
 */
/* Build an INIT chunk for @asoc.
 *
 * @asoc:       association the INIT is built for
 * @bp:         bind address list to advertise
 * @gfp:        allocation flags for the raw address buffer and the chunk
 * @vparam_len: extra bytes added to the requested chunk size; this function
 *              does not write anything into that space
 *
 * The function works in two passes: it first adds up the size of every
 * parameter that will be emitted, then allocates the chunk and appends the
 * parameters.  The conditions used while sizing must match the conditions
 * used while appending.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.  The raw
 * address buffer is freed before returning in either case.
 */
struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
				  const struct sctp_bind_addr *bp,
				  gfp_t gfp, int vparam_len)
{
	struct sctp_supported_ext_param ext_param;
	struct sctp_adaptation_ind_param aiparam;
	struct sctp_paramhdr *auth_chunks = NULL;
	struct sctp_paramhdr *auth_hmacs = NULL;
	struct sctp_supported_addrs_param sat;
	struct sctp_endpoint *ep = asoc->ep;
	struct sctp_chunk *retval = NULL;
	int num_types, addrs_len = 0;
	struct sctp_inithdr init;
	union sctp_params addrs;
	struct sctp_sock *sp;
	/* Room for every extension added below: ASCONF + ASCONF_ACK,
	 * RECONF, I_DATA and AUTH.
	 */
	__u8 extensions[5];
	size_t chunksize;
	__be16 types[2];
	int num_ext = 0;

	/* RFC 2960 3.3.2 Initiation (INIT) (1)
	 *
	 * Note 1: The INIT chunks can contain multiple addresses that
	 * can be IPv4 and/or IPv6 in any combination.
	 */

	/* Convert the provided bind address list to raw format. */
	addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp);

	/* Fixed INIT header, converted to network byte order. */
	init.init_tag		   = htonl(asoc->c.my_vtag);
	init.a_rwnd		   = htonl(asoc->rwnd);
	init.num_outbound_streams  = htons(asoc->c.sinit_num_ostreams);
	init.num_inbound_streams   = htons(asoc->c.sinit_max_instreams);
	init.initial_tsn	   = htonl(asoc->c.initial_tsn);

	/* How many address types are needed? */
	sp = sctp_sk(asoc->base.sk);
	num_types = sp->pf->supported_addrs(sp, types);

	chunksize = sizeof(init) + addrs_len;
	chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));

	if (asoc->ep->ecn_enable)
		chunksize += sizeof(ecap_param);

	if (asoc->ep->prsctp_enable)
		chunksize += sizeof(prsctp_param);

	/* ADDIP: Section 4.2.7:
	 *  An implementation supporting this extension [ADDIP] MUST list
	 *  the ASCONF, the ASCONF-ACK, and the AUTH chunks in its INIT and
	 *  INIT-ACK parameters.
	 */
	if (asoc->ep->asconf_enable) {
		extensions[num_ext] = SCTP_CID_ASCONF;
		extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
		num_ext += 2;
	}

	if (asoc->ep->reconf_enable) {
		extensions[num_ext] = SCTP_CID_RECONF;
		num_ext += 1;
	}

	if (sp->adaptation_ind)
		chunksize += sizeof(aiparam);

	if (asoc->ep->intl_enable) {
		extensions[num_ext] = SCTP_CID_I_DATA;
		num_ext += 1;
	}

	chunksize += vparam_len;

	/* Account for AUTH related parameters */
	if (ep->auth_enable) {
		/* Add random parameter length */
		chunksize += sizeof(asoc->c.auth_random);

		/* Add HMACS parameter length if any were defined */
		auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs;
		if (auth_hmacs->length)
			chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
		else
			auth_hmacs = NULL;

		/* Add CHUNKS parameter length */
		auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks;
		if (auth_chunks->length)
			chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
		else
			auth_chunks = NULL;

		extensions[num_ext] = SCTP_CID_AUTH;
		num_ext += 1;
	}

	/* If we have any extensions to report, account for that */
	if (num_ext)
		chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext);

	/* RFC 2960 3.3.2 Initiation (INIT) (1)
	 *
	 * Note 3: An INIT chunk MUST NOT contain more than one Host
	 * Name address parameter. Moreover, the sender of the INIT
	 * MUST NOT combine any other address types with the Host Name
	 * address in the INIT. The receiver of INIT MUST ignore any
	 * other address types if the Host Name address parameter is
	 * present in the received INIT chunk.
	 *
	 * PLEASE DO NOT FIXME [This version does not support Host Name.]
	 */

	retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp);
	if (!retval)
		goto nodata;

	retval->subh.init_hdr =
		sctp_addto_chunk(retval, sizeof(init), &init);
	retval->param_hdr.v =
		sctp_addto_chunk(retval, addrs_len, addrs.v);

	/* RFC 2960 3.3.2 Initiation (INIT) (1)
	 *
	 * Note 4: This parameter, when present, specifies all the
	 * address types the sending endpoint can support. The absence
	 * of this parameter indicates that the sending endpoint can
	 * support any address type.
	 */
	sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES;
	sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types));
	sctp_addto_chunk(retval, sizeof(sat), &sat);
	sctp_addto_chunk(retval, num_types * sizeof(__u16), &types);

	if (asoc->ep->ecn_enable)
		sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);

	/* Add the supported extensions parameter.  Be nice and add this
	 * first, before adding the parameters for the extensions themselves.
	 */
	if (num_ext) {
		ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
		ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext);
		sctp_addto_chunk(retval, sizeof(ext_param), &ext_param);
		sctp_addto_param(retval, num_ext, extensions);
	}

	if (asoc->ep->prsctp_enable)
		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);

	if (sp->adaptation_ind) {
		aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
		aiparam.param_hdr.length = htons(sizeof(aiparam));
		aiparam.adaptation_ind = htonl(sp->adaptation_ind);
		sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
	}

	/* Add SCTP-AUTH chunks to the parameter list */
	if (ep->auth_enable) {
		sctp_addto_chunk(retval, sizeof(asoc->c.auth_random),
				 asoc->c.auth_random);
		if (auth_hmacs)
			sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
					auth_hmacs);
		if (auth_chunks)
			sctp_addto_chunk(retval, ntohs(auth_chunks->length),
					auth_chunks);
	}
nodata:
	/* The raw address buffer is only needed while building the chunk. */
	kfree(addrs.v);
	return retval;
}

/* Build an INIT ACK chunk for @asoc in reply to the INIT in @chunk.
 *
 * @asoc:         association the INIT ACK is built for
 * @chunk:        the received INIT chunk; passed to sctp_pack_cookie() and
 *                used to pick the reply transport
 * @gfp:          allocation flags for the raw address buffer and the chunk
 * @unkparam_len: bytes reserved in the chunk for reporting unknown
 *                parameters; nothing is written into that space here
 *
 * Unlike sctp_make_init(), the optional parameters are selected by what the
 * peer advertised (asoc->peer.*_capable) rather than by the endpoint's own
 * enable flags.
 *
 * Returns the new chunk, or NULL if the cookie cannot be packed or
 * sctp_make_control() fails.  The temporary cookie and raw address buffer
 * are freed before returning.
 */
struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
				      const struct sctp_chunk *chunk,
				      gfp_t gfp, int unkparam_len)
{
	struct sctp_supported_ext_param ext_param;
	struct sctp_adaptation_ind_param aiparam;
	struct sctp_paramhdr *auth_chunks = NULL;
	struct sctp_paramhdr *auth_random = NULL;
	struct sctp_paramhdr *auth_hmacs = NULL;
	struct sctp_chunk *retval = NULL;
	struct sctp_cookie_param *cookie;
	struct sctp_inithdr initack;
	union sctp_params addrs;
	struct sctp_sock *sp;
	/* Room for every extension added below: ASCONF + ASCONF_ACK,
	 * RECONF, I_DATA and AUTH.
	 */
	__u8 extensions[5];
	size_t chunksize;
	int num_ext = 0;
	int cookie_len;
	int addrs_len;

	/* Note: there may be no addresses to embed. */
	addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp);

	/* Fixed INIT ACK header, converted to network byte order. */
	initack.init_tag	        = htonl(asoc->c.my_vtag);
	initack.a_rwnd			= htonl(asoc->rwnd);
	initack.num_outbound_streams	= htons(asoc->c.sinit_num_ostreams);
	initack.num_inbound_streams	= htons(asoc->c.sinit_max_instreams);
	initack.initial_tsn		= htonl(asoc->c.initial_tsn);

	/* FIXME:  We really ought to build the cookie right
	 * into the packet instead of allocating more fresh memory.
	 */
	cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len,
				  addrs.v, addrs_len);
	if (!cookie)
		goto nomem_cookie;

	/* Calculate the total size of allocation, include the reserved
	 * space for reporting unknown parameters if it is specified.
	 */
	sp = sctp_sk(asoc->base.sk);
	chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;

	/* Tell peer that we'll do ECN only if peer advertised such cap.  */
	if (asoc->peer.ecn_capable)
		chunksize += sizeof(ecap_param);

	if (asoc->peer.prsctp_capable)
		chunksize += sizeof(prsctp_param);

	if (asoc->peer.asconf_capable) {
		extensions[num_ext] = SCTP_CID_ASCONF;
		extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
		num_ext += 2;
	}

	if (asoc->peer.reconf_capable) {
		extensions[num_ext] = SCTP_CID_RECONF;
		num_ext += 1;
	}

	if (sp->adaptation_ind)
		chunksize += sizeof(aiparam);

	if (asoc->peer.intl_capable) {
		extensions[num_ext] = SCTP_CID_I_DATA;
		num_ext += 1;
	}

	if (asoc->peer.auth_capable) {
		/* The random parameter is sized from its own length field. */
		auth_random = (struct sctp_paramhdr *)asoc->c.auth_random;
		chunksize += ntohs(auth_random->length);

		/* HMACS and CHUNKS are included only when their length is
		 * non-zero; a NULL pointer marks them as absent below.
		 */
		auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs;
		if (auth_hmacs->length)
			chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
		else
			auth_hmacs = NULL;

		auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks;
		if (auth_chunks->length)
			chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
		else
			auth_chunks = NULL;

		extensions[num_ext] = SCTP_CID_AUTH;
		num_ext += 1;
	}

	if (num_ext)
		chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext);

	/* Now allocate and fill out the chunk.  */
	retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
	if (!retval)
		goto nomem_chunk;

	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
	 *
	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
	 * HEARTBEAT ACK, etc.) to the same destination transport
	 * address from which it received the DATA or control chunk
	 * to which it is replying.
	 *
	 * [INIT ACK back to where the INIT came from.]
	 */
	if (chunk->transport)
		retval->transport =
			sctp_assoc_lookup_paddr(asoc,
						&chunk->transport->ipaddr);

	retval->subh.init_hdr =
		sctp_addto_chunk(retval, sizeof(initack), &initack);
	retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v);
	sctp_addto_chunk(retval, cookie_len, cookie);
	if (asoc->peer.ecn_capable)
		sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
	if (num_ext) {
		ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
		ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext);
		sctp_addto_chunk(retval, sizeof(ext_param), &ext_param);
		sctp_addto_param(retval, num_ext, extensions);
	}
	if (asoc->peer.prsctp_capable)
		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);

	if (sp->adaptation_ind) {
		aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
		aiparam.param_hdr.length = htons(sizeof(aiparam));
		aiparam.adaptation_ind = htonl(sp->adaptation_ind);
		sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
	}

	if (asoc->peer.auth_capable) {
		sctp_addto_chunk(retval, ntohs(auth_random->length),
				 auth_random);
		if (auth_hmacs)
			sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
					auth_hmacs);
		if (auth_chunks)
			sctp_addto_chunk(retval, ntohs(auth_chunks->length),
					auth_chunks);
	}

	/* We need to remove the const qualifier at this point.  */
	retval->asoc = (struct sctp_association *) asoc;

nomem_chunk:
	/* The cookie was copied into the chunk (or the chunk allocation
	 * failed); either way the temporary copy is released here.
	 */
	kfree(cookie);
nomem_cookie:
	kfree(addrs.v);
	return retval;
}

/* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association.
* It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] 
*/ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. 
* * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. 
*/ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) 
to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. 
*/ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] 
*/ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. 
*/
/* @tsn is given in host byte order and becomes the payload of the
 * SCTP_ERROR_NO_DATA cause.  @chunk may be NULL.  Returns the new chunk
 * or NULL if it could not be allocated.
 */
struct sctp_chunk *sctp_make_abort_no_data(
					const struct sctp_association *asoc,
					const struct sctp_chunk *chunk,
					__u32 tsn)
{
	struct sctp_chunk *retval;
	__be32 payload;

	retval = sctp_make_abort(asoc, chunk,
				 sizeof(struct sctp_errhdr) + sizeof(tsn));
	if (!retval)
		return NULL;

	/* Put the tsn back into network byte order.  */
	payload = htonl(tsn);
	sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload));
	sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload);

	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
	 *
	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
	 * HEARTBEAT ACK, * etc.) to the same destination transport
	 * address from which it * received the DATA or control chunk
	 * to which it is replying.
	 *
	 * [ABORT back to where the offender came from.]
	 */
	if (chunk)
		retval->transport = chunk->transport;

	return retval;
}

/* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error.
 *
 * @msg:    message whose first @paylen bytes become the cause payload
 * @paylen: payload length in bytes; may be 0, in which case @msg is not
 *          read and no temporary buffer is allocated
 *
 * The payload is staged in a temporary GFP_KERNEL buffer that is released
 * before returning.  Returns the new chunk, or NULL if the chunk or the
 * staging buffer could not be allocated or copying from @msg failed.
 */
struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
					struct msghdr *msg,
					size_t paylen)
{
	struct sctp_chunk *retval;
	void *payload = NULL;

	retval = sctp_make_abort(asoc, NULL,
				 sizeof(struct sctp_errhdr) + paylen);
	if (!retval)
		return NULL;

	if (paylen) {
		/* Put the msg_iov together into payload.  */
		payload = kmalloc(paylen, GFP_KERNEL);
		if (!payload)
			goto err_payload;

		if (memcpy_from_msg(payload, msg, paylen) < 0)
			goto err_copy;
	}

	sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen);
	sctp_addto_chunk(retval, paylen, payload);

	/* kfree(NULL) is a no-op, so no need to test paylen again. */
	kfree(payload);

	return retval;

err_copy:
	kfree(payload);
err_payload:
	sctp_chunk_free(retval);
	return NULL;
}

/* Append bytes to the end of a parameter.  Will panic if chunk is not big
 * enough.
*/ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = 
sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. 
*/ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. 
*/ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. */ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. 
*/
/* The cause is initialised with @cause_code and a length of
 * @paylen + @reserve_tail, then @paylen bytes of @payload are appended,
 * followed by @reserve_tail zero bytes when that is non-zero.  @chunk may
 * be NULL.  Returns NULL on allocation failure.
 */
struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
				      const struct sctp_chunk *chunk,
				      __be16 cause_code, const void *payload,
				      size_t paylen, size_t reserve_tail)
{
	struct sctp_chunk *retval;

	retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail);
	if (!retval)
		return NULL;

	sctp_init_cause(retval, cause_code, paylen + reserve_tail);
	sctp_addto_chunk(retval, paylen, payload);
	if (reserve_tail)
		sctp_addto_param(retval, reserve_tail, NULL);

	return retval;
}

/* Make an AUTH chunk for @asoc using shared key id @key_id.
 *
 * The chunk holds the AUTH header followed by hmac_len zero bytes that
 * reserve room for the MAC.  Returns NULL if no HMAC descriptor is
 * available for the association or the chunk could not be allocated.
 */
struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
				  __u16 key_id)
{
	struct sctp_authhdr auth_hdr;
	struct sctp_hmac *hmac_desc;
	struct sctp_chunk *retval;

	/* Get the first hmac that the peer told us to use */
	hmac_desc = sctp_auth_asoc_get_hmac(asoc);
	if (unlikely(!hmac_desc))
		return NULL;

	retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
				   hmac_desc->hmac_len + sizeof(auth_hdr),
				   GFP_ATOMIC);
	if (!retval)
		return NULL;

	auth_hdr.hmac_id = htons(hmac_desc->hmac_id);
	auth_hdr.shkey_id = htons(key_id);

	retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr),
						 &auth_hdr);

	/* Append the empty (all-zero) MAC.  With a NULL source the helper
	 * zero-fills the new bytes and adjusts the chunk header length and
	 * chunk_end.
	 */
	sctp_addto_param(retval, hmac_desc->hmac_len, NULL);

	return retval;
}


/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Turn an skb into a chunk.
 * FIXME: Eventually move the structure directly inside the skb->cb[].
 *
 * sctpimpguide-05.txt Section 2.8.2
 * M1) Each time a new DATA chunk is transmitted
 * set the 'TSN.Missing.Report' count for that TSN to 0. The
 * 'TSN.Missing.Report' count will be used to determine missing chunks
 * and when to fast retransmit.
* */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. */ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. 
*/ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. 
Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. 
This assumes that both
 * the data_hdr and association have already been assigned.
 *
 * Does nothing when chunk->has_tsn is already set.  Otherwise the value
 * of sctp_association_get_next_tsn(chunk->asoc) is stored in the DATA
 * header in network byte order and has_tsn is set.
 */
void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
{
	if (!chunk->has_tsn) {
		/* This is the last possible instant to
		 * assign a TSN.
		 */
		chunk->subh.data_hdr->tsn =
			htonl(sctp_association_get_next_tsn(chunk->asoc));
		chunk->has_tsn = 1;
	}
}

/* Create a CLOSED association to use with an incoming packet.
 *
 * The new association is flagged as temporary (asoc->temp = 1) and its
 * cookie peer address is filled in from the skb of @chunk.  Returns the
 * association, or NULL when sctp_association_new() failed.
 */
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
					     struct sctp_chunk *chunk,
					     gfp_t gfp)
{
	struct sctp_association *asoc;
	enum sctp_scope scope;
	struct sk_buff *skb;

	/* Create the bare association.  */
	scope = sctp_scope(sctp_source(chunk));
	asoc = sctp_association_new(ep, ep->base.sk, scope, gfp);
	if (!asoc)
		goto nodata;
	asoc->temp = 1;
	skb = chunk->skb;
	/* Create an entry for the source address of the packet.  */
	SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1);

nodata:
	/* asoc is NULL here when the allocation above failed. */
	return asoc;
}

/* Build a cookie representing asoc.
 * This INCLUDES the param header needed to put the cookie in the INIT ACK.
 *
 * The buffer holds the parameter header, the signed cookie (signature
 * plus struct sctp_cookie), a copy of the peer's INIT chunk placed right
 * behind the signed cookie, and finally @addrs_len bytes of @raw_addrs.
 * The signed part is padded to a multiple of SCTP_COOKIE_MULTIPLE.
 *
 * On success returns the kzalloc()ed parameter and stores its total size
 * in *cookie_len.  On failure returns NULL and sets *cookie_len to 0.
 */
static struct sctp_cookie_param *sctp_pack_cookie(
					const struct sctp_endpoint *ep,
					const struct sctp_association *asoc,
					const struct sctp_chunk *init_chunk,
					int *cookie_len, const __u8 *raw_addrs,
					int addrs_len)
{
	struct sctp_signed_cookie *cookie;
	struct sctp_cookie_param *retval;
	int headersize, bodysize;

	/* Header size is static data prior to the actual cookie, including
	 * any padding.
	 */
	headersize = sizeof(struct sctp_paramhdr) +
		     (sizeof(struct sctp_signed_cookie) -
		      sizeof(struct sctp_cookie));
	bodysize = sizeof(struct sctp_cookie)
		+ ntohs(init_chunk->chunk_hdr->length) + addrs_len;

	/* Pad out the cookie to a multiple to make the signature
	 * functions simpler to write.
	 */
	if (bodysize % SCTP_COOKIE_MULTIPLE)
		bodysize += SCTP_COOKIE_MULTIPLE
			- (bodysize % SCTP_COOKIE_MULTIPLE);
	*cookie_len = headersize + bodysize;

	/* Clear this memory since we are sending this data structure
	 * out on the network.
	 */
	retval = kzalloc(*cookie_len, GFP_ATOMIC);
	if (!retval)
		goto nodata;

	cookie = (struct sctp_signed_cookie *) retval->body;

	/* Set up the parameter header.  */
	retval->p.type = SCTP_PARAM_STATE_COOKIE;
	retval->p.length = htons(*cookie_len);

	/* Copy the cookie part of the association itself.  */
	cookie->c = asoc->c;
	/* Save the raw address list length in the cookie. */
	cookie->c.raw_addr_list_len = addrs_len;

	/* Remember PR-SCTP capability. */
	cookie->c.prsctp_capable = asoc->peer.prsctp_capable;

	/* Save adaptation indication in the cookie. */
	cookie->c.adaptation_ind = asoc->peer.adaptation_ind;

	/* Set an expiration time for the cookie.  */
	cookie->c.expiration = ktime_add(asoc->cookie_life,
					 ktime_get_real());

	/* Copy the peer's init packet.  */
	memcpy(cookie + 1, init_chunk->chunk_hdr,
	       ntohs(init_chunk->chunk_hdr->length));

	/* Copy the raw local address list of the association. */
	memcpy((__u8 *)(cookie + 1) +
	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);

	/* Signing is skipped entirely when the socket has no hmac tfm. */
	if (sctp_sk(ep->base.sk)->hmac) {
		struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
		int err;

		/* Sign the message.  */
		err = crypto_shash_setkey(tfm, ep->secret_key,
					  sizeof(ep->secret_key)) ?:
		      crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize,
					      cookie->signature);
		if (err)
			goto free_cookie;
	}

	return retval;

free_cookie:
	kfree(retval);
nodata:
	*cookie_len = 0;
	return NULL;
}

/* Unpack the cookie from COOKIE ECHO chunk, recreating the association.
 *
 * @asoc:  existing association, if any; when non-NULL the cookie's
 *         expiration is not checked.
 * @error: set to a negative SCTP_IERROR_* value on every failure path.
 * @errp:  receives a Stale Cookie ERROR chunk (or NULL if it could not
 *         be built) when the cookie has expired.
 *
 * Returns the newly created association, or NULL on failure.
 */
struct sctp_association *sctp_unpack_cookie(
					const struct sctp_endpoint *ep,
					const struct sctp_association *asoc,
					struct sctp_chunk *chunk, gfp_t gfp,
					int *error, struct sctp_chunk **errp)
{
	struct sctp_association *retval = NULL;
	int headersize, bodysize, fixed_size;
	struct sctp_signed_cookie *cookie;
	struct sk_buff *skb = chunk->skb;
	struct sctp_cookie *bear_cookie;
	__u8 *digest = ep->digest;
	enum sctp_scope scope;
	unsigned int len;
	ktime_t kt;

	/* Header size is static data prior to the actual cookie, including
	 * any padding.
	 */
	headersize = sizeof(struct sctp_chunkhdr) +
		     (sizeof(struct sctp_signed_cookie) -
		      sizeof(struct sctp_cookie));
	bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
	fixed_size = headersize + sizeof(struct sctp_cookie);

	/* Verify that the chunk looks like it even has a cookie.
	 * There must be enough room for our cookie and our peer's
	 * INIT chunk.
	 */
	len = ntohs(chunk->chunk_hdr->length);
	if (len < fixed_size + sizeof(struct sctp_chunkhdr))
		goto malformed;

	/* Verify that the cookie has been padded out. */
	if (bodysize % SCTP_COOKIE_MULTIPLE)
		goto malformed;

	/* Process the cookie.  */
	cookie = chunk->subh.cookie_hdr;
	bear_cookie = &cookie->c;

	/* Without an hmac tfm the signature check is skipped altogether. */
	if (!sctp_sk(ep->base.sk)->hmac)
		goto no_hmac;

	/* Check the signature.  */
	{
		struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
		int err;

		err = crypto_shash_setkey(tfm, ep->secret_key,
					  sizeof(ep->secret_key)) ?:
		      crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize,
					      digest);
		if (err) {
			*error = -SCTP_IERROR_NOMEM;
			goto fail;
		}
	}

	/* NOTE(review): memcmp() is not guaranteed to run in constant time;
	 * a constant-time comparison such as crypto_memneq() looks more
	 * appropriate for a MAC check -- confirm and add the required
	 * include before changing.
	 */
	if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
		*error = -SCTP_IERROR_BAD_SIG;
		goto fail;
	}

no_hmac:
	/* IG Section 2.35.2:
	 *  3) Compare the port numbers and the verification tag contained
	 *     within the COOKIE ECHO chunk to the actual port numbers and the
	 *     verification tag within the SCTP common header of the received
	 *     packet. If these values do not match the packet MUST be silently
	 *     discarded,
	 */
	if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) {
		*error = -SCTP_IERROR_BAD_TAG;
		goto fail;
	}

	if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port ||
	    ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) {
		*error = -SCTP_IERROR_BAD_PORTS;
		goto fail;
	}

	/* Check to see if the cookie is stale.  If there is already
	 * an association, there is no need to check cookie's expiration
	 * for init collision case of lost COOKIE ACK.
	 * If skb has been timestamped, then use the stamp, otherwise
	 * use current time.  This introduces a small possibility that
	 * a cookie may be considered expired, but this would only slow
	 * down the new association establishment instead of every packet.
	 */
	if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
		kt = skb_get_ktime(skb);
	else
		kt = ktime_get_real();

	if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
		/* How long ago the cookie expired, in microseconds. */
		suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
		__be32 n = htonl(usecs);

		/*
		 * Section 3.3.10.3 Stale Cookie Error (3)
		 *
		 * Cause of error
		 * ---------------
		 * Stale Cookie Error:  Indicates the receipt of a valid State
		 * Cookie that has expired.
		 */
		*errp = sctp_make_op_error(asoc, chunk,
					   SCTP_ERROR_STALE_COOKIE, &n,
					   sizeof(n), 0);
		if (*errp)
			*error = -SCTP_IERROR_STALE_COOKIE;
		else
			*error = -SCTP_IERROR_NOMEM;

		goto fail;
	}

	/* Make a new base association.  */
	scope = sctp_scope(sctp_source(chunk));
	retval = sctp_association_new(ep, ep->base.sk, scope, gfp);
	if (!retval) {
		*error = -SCTP_IERROR_NOMEM;
		goto fail;
	}

	/* Set up our peer's port number.  */
	retval->peer.port = ntohs(chunk->sctp_hdr->source);

	/* Populate the association from the cookie.  */
	memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie));

	if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie,
						 GFP_ATOMIC) < 0) {
		*error = -SCTP_IERROR_NOMEM;
		goto fail;
	}

	/* Also, add the destination address. */
	if (list_empty(&retval->base.bind_addr.address_list)) {
		sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest,
				   sizeof(chunk->dest), SCTP_ADDR_SRC,
				   GFP_ATOMIC);
	}

	/* Seed the sequence-number state from the cookie's initial TSN. */
	retval->next_tsn = retval->c.initial_tsn;
	retval->ctsn_ack_point = retval->next_tsn - 1;
	retval->addip_serial = retval->c.initial_tsn;
	retval->strreset_outseq = retval->c.initial_tsn;
	retval->adv_peer_ack_point = retval->ctsn_ack_point;
	retval->peer.prsctp_capable = retval->c.prsctp_capable;
	retval->peer.adaptation_ind = retval->c.adaptation_ind;

	/* The INIT stuff will be done by the side effects.  */
	return retval;

fail:
	/* retval is only non-NULL once the new association was allocated. */
	if (retval)
		sctp_association_free(retval);

	return NULL;

malformed:
	/* Yikes!  The packet is either corrupt or deliberately
	 * malformed.
	 */
	*error = -SCTP_IERROR_MALFORMED;
	goto fail;
}

/********************************************************************
 * 3rd Level Abstractions
 ********************************************************************/

/* Payload of the cause built by sctp_process_missing_param(): a count
 * followed by one parameter type.
 */
struct __sctp_missing {
	__be32 num_missing;
	__be16 type;
}  __packed;

/*
 * Report a missing mandatory parameter.
 *
 * Allocates *errp if it is still NULL and, when an error chunk is
 * available, appends a SCTP_ERROR_MISS_PARAM cause naming @paramtype.
 * Always returns 0.
 */
static int sctp_process_missing_param(const struct sctp_association *asoc,
				      enum sctp_param paramtype,
				      struct sctp_chunk *chunk,
				      struct sctp_chunk **errp)
{
	struct __sctp_missing report;
	__u16 len;

	len = SCTP_PAD4(sizeof(report));

	/* Make an ERROR chunk, preparing enough room for
	 * returning multiple unknown parameters.
	 */
	if (!*errp)
		*errp = sctp_make_op_error_space(asoc, chunk, len);

	if (*errp) {
		report.num_missing = htonl(1);
		report.type = paramtype;
		sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM,
				sizeof(report));
		sctp_addto_chunk(*errp, sizeof(report), &report);
	}

	/* Stop processing this chunk. */
	return 0;
}

/* Report an Invalid Mandatory Parameter.
 *
 * Allocates *errp if it is still NULL and adds a payload-less
 * SCTP_ERROR_INV_PARAM cause to it.  Always returns 0.
 */
static int sctp_process_inv_mandatory(const struct sctp_association *asoc,
				      struct sctp_chunk *chunk,
				      struct sctp_chunk **errp)
{
	/* Invalid Mandatory Parameter Error has no payload. */

	if (!*errp)
		*errp = sctp_make_op_error_space(asoc, chunk, 0);

	if (*errp)
		sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0);

	/* Stop processing this chunk. */
	return 0;
}

/* Report a parameter with an invalid length.
 *
 * Any error chunk accumulated so far is freed and *errp is replaced by
 * the result of sctp_make_violation_paramlen().  Always returns 0.
 */
static int sctp_process_inv_paramlength(const struct sctp_association *asoc,
					struct sctp_paramhdr *param,
					const struct sctp_chunk *chunk,
					struct sctp_chunk **errp)
{
	/* This is a fatal error.  Any accumulated non-fatal errors are
	 * not reported.
	 */
	if (*errp)
		sctp_chunk_free(*errp);

	/* Create an error chunk and fill it in with our payload. */
	*errp = sctp_make_violation_paramlen(asoc, chunk, param);

	return 0;
}

/* Do not attempt to handle the HOST_NAME parm.  However, do
 * send back an indicator to the peer.
*/ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. 
*/ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. 
*/ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* This param has been Deprecated, send ABORT. 
*/ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. */ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. 
*/
/* Returns 0 when a mandatory field is invalid, a parameter length is bad,
 * the State Cookie is missing from an INIT-ACK, or a parameter check asks
 * for an ABORT / ran out of memory.  Returns 1 otherwise.  Error causes
 * are accumulated in *errp.
 */
int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
		     const struct sctp_association *asoc, enum sctp_cid cid,
		     struct sctp_init_chunk *peer_init,
		     struct sctp_chunk *chunk, struct sctp_chunk **errp)
{
	union sctp_params param;
	bool cookie_seen = false;
	int verdict;

	/* Check for missing mandatory parameters. Note: Initial TSN is
	 * also mandatory, but is not checked here since the valid range
	 * is 0..2**32-1. RFC4960, section 3.3.3.
	 */
	if (!peer_init->init_hdr.num_outbound_streams ||
	    !peer_init->init_hdr.num_inbound_streams ||
	    !peer_init->init_hdr.init_tag ||
	    ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
		return sctp_process_inv_mandatory(asoc, chunk, errp);

	/* First pass: only look for the State Cookie. */
	sctp_walk_params(param, peer_init) {
		if (param.p->type == SCTP_PARAM_STATE_COOKIE)
			cookie_seen = true;
	}

	/* There is a possibility that a parameter length was bad and
	 * in that case we would have stopped walking the parameters.
	 * The current param.p would point at the bad one.
	 * Current consensus on the mailing list is to generate a PROTOCOL
	 * VIOLATION error.  We build the ERROR chunk here and let the normal
	 * error handling code build and send the packet.
	 */
	if (param.v != (void *)chunk->chunk_end)
		return sctp_process_inv_paramlength(asoc, param.p, chunk, errp);

	/* The only missing mandatory param possible today is
	 * the state cookie for an INIT-ACK chunk.
	 */
	if (cid == SCTP_CID_INIT_ACK && !cookie_seen)
		return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE,
						  chunk, errp);

	/* Second pass: verify all the variable length parameters. */
	sctp_walk_params(param, peer_init) {
		verdict = sctp_verify_param(net, ep, asoc, param, cid,
					    chunk, errp);

		if (verdict == SCTP_IERROR_ABORT ||
		    verdict == SCTP_IERROR_NOMEM)
			return 0;
		if (verdict == SCTP_IERROR_ERROR)
			return 1;
		/* Anything else: keep walking. */
	}

	return 1;
}

/* Unpack the parameters in an INIT packet into an association.
 * Returns 0 on failure, else success.
* FIXME: This is an association method.
 *
 * @peer_addr is added as the first (ACTIVE) transport; every parameter of
 * @peer_init is then fed to sctp_process_param().  The function returns 1
 * on success.  On failure it returns 0; except for the very first
 * add-peer failure, all transports that are not SCTP_ACTIVE are removed
 * again before returning.
 */
int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
		      const union sctp_addr *peer_addr,
		      struct sctp_init_chunk *peer_init, gfp_t gfp)
{
	struct sctp_transport *transport;
	struct list_head *pos, *temp;
	union sctp_params param;
	union sctp_addr addr;
	struct sctp_af *af;
	int src_match = 0;

	/* We must include the address that the INIT packet came from.
	 * This is the only address that matters for an INIT packet.
	 * When processing a COOKIE ECHO, we retrieve the from address
	 * of the INIT from the cookie.
	 */

	/* This implementation defaults to making the first transport
	 * added as the primary transport.  The source address seems to
	 * be a better choice than any of the embedded addresses.
	 */
	asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
	if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
		goto nomem;

	if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr))
		src_match = 1;

	/* Process the initialization parameters.  */
	sctp_walk_params(param, peer_init) {
		/* Until a match is found, compare each embedded address
		 * against the packet's source address.
		 */
		if (!src_match &&
		    (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
		     param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
			af = sctp_get_af_specific(param_type2af(param.p->type));
			if (!af->from_addr_param(&addr, param.addr,
						 chunk->sctp_hdr->source, 0))
				continue;
			if (sctp_cmp_addr_exact(sctp_source(chunk), &addr))
				src_match = 1;
		}

		if (!sctp_process_param(asoc, param, peer_addr, gfp))
			goto clean_up;
	}

	/* source address of chunk may not match any valid address */
	if (!src_match)
		goto clean_up;

	/* AUTH: After processing the parameters, make sure that we
	 * have all the required info to potentially do authentications.
	 */
	if (asoc->peer.auth_capable && (!asoc->peer.peer_random ||
					!asoc->peer.peer_hmacs))
		asoc->peer.auth_capable = 0;

	/* In a non-backward compatible mode, if the peer claims
	 * support for ADD-IP but not AUTH,  the ADD-IP spec states
	 * that we MUST ABORT the association. Section 6.  The section
	 * also give us an option to silently ignore the packet, which
	 * is what we'll do here.
	 */
	if (!asoc->base.net->sctp.addip_noauth &&
	    (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
		asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP |
						   SCTP_PARAM_DEL_IP |
						   SCTP_PARAM_SET_PRIMARY);
		asoc->peer.asconf_capable = 0;
		goto clean_up;
	}

	/* Walk list of transports, removing transports in the UNKNOWN state. */
	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
		transport = list_entry(pos, struct sctp_transport, transports);
		if (transport->state == SCTP_UNKNOWN) {
			sctp_assoc_rm_peer(asoc, transport);
		}
	}

	/* The fixed INIT headers are always in network byte
	 * order.
	 */
	asoc->peer.i.init_tag =
		ntohl(peer_init->init_hdr.init_tag);
	asoc->peer.i.a_rwnd =
		ntohl(peer_init->init_hdr.a_rwnd);
	asoc->peer.i.num_outbound_streams =
		ntohs(peer_init->init_hdr.num_outbound_streams);
	asoc->peer.i.num_inbound_streams =
		ntohs(peer_init->init_hdr.num_inbound_streams);
	asoc->peer.i.initial_tsn =
		ntohl(peer_init->init_hdr.initial_tsn);

	asoc->strreset_inseq = asoc->peer.i.initial_tsn;

	/* Apply the upper bounds for output streams based on peer's
	 * number of inbound streams.
	 */
	if (asoc->c.sinit_num_ostreams  >
	    ntohs(peer_init->init_hdr.num_inbound_streams)) {
		asoc->c.sinit_num_ostreams =
			ntohs(peer_init->init_hdr.num_inbound_streams);
	}

	/* Likewise cap our inbound streams at the peer's outbound count. */
	if (asoc->c.sinit_max_instreams >
	    ntohs(peer_init->init_hdr.num_outbound_streams)) {
		asoc->c.sinit_max_instreams =
			ntohs(peer_init->init_hdr.num_outbound_streams);
	}

	/* Copy Initiation tag from INIT to VT_peer in cookie.   */
	asoc->c.peer_vtag = asoc->peer.i.init_tag;

	/* Peer Rwnd   : Current calculated value of the peer's rwnd.  */
	asoc->peer.rwnd = asoc->peer.i.a_rwnd;

	/* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
	 * high (for example, implementations MAY use the size of the receiver
	 * advertised window).
	 */
	list_for_each_entry(transport, &asoc->peer.transport_addr_list,
			transports) {
		transport->ssthresh = asoc->peer.i.a_rwnd;
	}

	/* Set up the TSN tracking pieces.  */
	if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
				asoc->peer.i.initial_tsn, gfp))
		goto clean_up;

	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
	 *
	 * The stream sequence number in all the streams shall start
	 * from 0 when the association is established.  Also, when the
	 * stream sequence number reaches the value 65535 the next
	 * stream sequence number shall be set to 0.
	 */

	if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams,
			     asoc->c.sinit_max_instreams, gfp))
		goto clean_up;

	/* Update frag_point when stream_interleave may get changed. */
	sctp_assoc_update_frag_point(asoc);

	/* Temporary associations are not given an id. */
	if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
		goto clean_up;

	/* ADDIP Section 4.1 ASCONF Chunk Procedures
	 *
	 * When an endpoint has an ASCONF signaled change to be sent to the
	 * remote endpoint it should do the following:
	 * ...
	 * A2) A serial number should be assigned to the Chunk. The serial
	 * number should be a monotonically increasing number. All serial
	 * numbers are defined to be initialized at the start of the
	 * association to the same value as the Initial TSN.
	 */
	asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1;
	return 1;

clean_up:
	/* Release the transport structures. */
	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
		transport = list_entry(pos, struct sctp_transport, transports);
		if (transport->state != SCTP_ACTIVE)
			sctp_assoc_rm_peer(asoc, transport);
	}

nomem:
	return 0;
}


/* Update asoc with the option described in param.
 *
 * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT
 *
 * asoc is the association to update.
 * param is the variable length parameter to use for update.
 * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO.
 * If the current packet is an INIT we want to minimize the amount of
 * work we do.
In particular, we should not build transport
 * structures for the addresses.
 *
 * NOTE(review): the current signature has no cid argument; the sentence
 * about cid above looks stale -- confirm against the file history.
 *
 * Returns 1 on success and 0 when adding a peer transport or duplicating
 * a parameter (kmemdup) failed.
 */
static int sctp_process_param(struct sctp_association *asoc,
			      union sctp_params param,
			      const union sctp_addr *peer_addr,
			      gfp_t gfp)
{
	struct sctp_endpoint *ep = asoc->ep;
	union sctp_addr_param *addr_param;
	struct net *net = asoc->base.net;
	struct sctp_transport *t;
	enum sctp_scope scope;
	union sctp_addr addr;
	struct sctp_af *af;
	int retval = 1, i;
	u32 stale;
	__u16 sat;

	/* We maintain all INIT parameters in network byte order all the
	 * time.  This allows us to not worry about whether the parameters
	 * came from a fresh INIT, and INIT ACK, or were stored in a cookie.
	 */
	switch (param.p->type) {
	case SCTP_PARAM_IPV6_ADDRESS:
		/* v6 addresses are ignored unless the socket is PF_INET6. */
		if (PF_INET6 != asoc->base.sk->sk_family)
			break;
		goto do_addr_param;

	case SCTP_PARAM_IPV4_ADDRESS:
		/* v4 addresses are not allowed on v6-only socket */
		if (ipv6_only_sock(asoc->base.sk))
			break;
do_addr_param:
		af = sctp_get_af_specific(param_type2af(param.p->type));
		if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0))
			break;
		/* Only addresses within the scope of peer_addr are added. */
		scope = sctp_scope(peer_addr);
		if (sctp_in_scope(net, &addr, scope))
			if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED))
				return 0;
		break;

	case SCTP_PARAM_COOKIE_PRESERVATIVE:
		if (!net->sctp.cookie_preserve_enable)
			break;

		stale = ntohl(param.life->lifespan_increment);

		/* Suggested Cookie Life span increment's unit is msec,
		 * (1/1000sec).
		 */
		asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
		break;

	case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
		/* Turn off the default values first so we'll know which
		 * ones are really set by the peer.
		 */
		asoc->peer.ipv4_address = 0;
		asoc->peer.ipv6_address = 0;

		/* Assume that peer supports the address family
		 * by which it sends a packet.
		 */
		if (peer_addr->sa.sa_family == AF_INET6)
			asoc->peer.ipv6_address = 1;
		else if (peer_addr->sa.sa_family == AF_INET)
			asoc->peer.ipv4_address = 1;

		/* Cycle through address types; avoid divide by 0. */
		sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
		if (sat)
			sat /= sizeof(__u16);

		for (i = 0; i < sat; ++i) {
			switch (param.sat->types[i]) {
			case SCTP_PARAM_IPV4_ADDRESS:
				asoc->peer.ipv4_address = 1;
				break;

			case SCTP_PARAM_IPV6_ADDRESS:
				if (PF_INET6 == asoc->base.sk->sk_family)
					asoc->peer.ipv6_address = 1;
				break;

			default: /* Just ignore anything else.  */
				break;
			}
		}
		break;

	case SCTP_PARAM_STATE_COOKIE:
		/* Replace any previously stored cookie with a copy of the
		 * parameter body.
		 */
		asoc->peer.cookie_len =
			ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
		kfree(asoc->peer.cookie);
		asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp);
		if (!asoc->peer.cookie)
			retval = 0;
		break;

	case SCTP_PARAM_HEARTBEAT_INFO:
		/* Would be odd to receive, but it causes no problems. */
		break;

	case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
		/* Rejected during verify stage. */
		break;

	case SCTP_PARAM_ECN_CAPABLE:
		if (asoc->ep->ecn_enable) {
			asoc->peer.ecn_capable = 1;
			break;
		}
		/* Fall Through */
		goto fall_through;


	case SCTP_PARAM_ADAPTATION_LAYER_IND:
		asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
		break;

	case SCTP_PARAM_SET_PRIMARY:
		if (!ep->asconf_enable)
			goto fall_through;

		/* The address parameter follows the ADDIP parameter header. */
		addr_param = param.v + sizeof(struct sctp_addip_param);

		af = sctp_get_af_specific(param_type2af(addr_param->p.type));
		if (!af)
			break;

		if (!af->from_addr_param(&addr, addr_param,
					 htons(asoc->peer.port), 0))
			break;

		if (!af->addr_valid(&addr, NULL, NULL))
			break;

		/* Only an address already known as a transport can become
		 * the primary.
		 */
		t = sctp_assoc_lookup_paddr(asoc, &addr);
		if (!t)
			break;

		sctp_assoc_set_primary(asoc, t);
		break;

	case SCTP_PARAM_SUPPORTED_EXT:
		sctp_process_ext_param(asoc, param);
		break;

	case SCTP_PARAM_FWD_TSN_SUPPORT:
		if (asoc->ep->prsctp_enable) {
			asoc->peer.prsctp_capable = 1;
			break;
		}
		/* Fall Through */
		goto fall_through;

	case SCTP_PARAM_RANDOM:
		if (!ep->auth_enable)
			goto fall_through;

		/* Save peer's random parameter */
		kfree(asoc->peer.peer_random);
		asoc->peer.peer_random = kmemdup(param.p,
					    ntohs(param.p->length), gfp);
		if (!asoc->peer.peer_random) {
			retval = 0;
			break;
		}
		break;

	case SCTP_PARAM_HMAC_ALGO:
		if (!ep->auth_enable)
			goto fall_through;

		/* Save peer's HMAC list */
		kfree(asoc->peer.peer_hmacs);
		asoc->peer.peer_hmacs = kmemdup(param.p,
					    ntohs(param.p->length), gfp);
		if (!asoc->peer.peer_hmacs) {
			retval = 0;
			break;
		}

		/* Set the default HMAC the peer requested*/
		sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo);
		break;

	case SCTP_PARAM_CHUNKS:
		if (!ep->auth_enable)
			goto fall_through;

		/* Save the peer's CHUNKS parameter, replacing any old copy. */
		kfree(asoc->peer.peer_chunks);
		asoc->peer.peer_chunks = kmemdup(param.p,
					    ntohs(param.p->length), gfp);
		if (!asoc->peer.peer_chunks)
			retval = 0;
		break;
fall_through:
	default:
		/* Any unrecognized parameters should have been caught
		 * and handled by sctp_verify_param() which should be
		 * called prior to this routine.  Simply log the error
		 * here.
		 */
		pr_debug("%s: ignoring param:%d for association:%p.\n",
			 __func__, ntohs(param.p->type), asoc);
		break;
	}

	return retval;
}

/* Select a new verification tag.
 *
 * Draws random 32-bit values until one is non-zero and returns it.
 * @ep is not used by the body.
 */
__u32 sctp_generate_tag(const struct sctp_endpoint *ep)
{
	/* I believe that this random number generator complies with RFC1750.
	 * A tag of 0 is reserved for special cases (e.g. INIT).
	 */
	__u32 x;

	do {
		get_random_bytes(&x, sizeof(__u32));
	} while (x == 0);

	return x;
}

/* Select an initial TSN to send during startup.
 *
 * Returns a random 32-bit value (0 is allowed).  @ep is not used by the
 * body.
 */
__u32 sctp_generate_tsn(const struct sctp_endpoint *ep)
{
	__u32 retval;

	get_random_bytes(&retval, sizeof(__u32));
	return retval;
}

/*
 * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     | Type = 0xC1   |  Chunk Flags  |      Chunk Length             |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Serial Number                           |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                    Address Parameter                          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                     ASCONF Parameter #1                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     \                                                               \
 *     /                             ....
/
 *     \                                                               \
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                     ASCONF Parameter #N                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Address Parameter and other parameter will not be wrapped in this function
 *
 * Builds an ASCONF chunk holding the ADDIP header (serial number) and
 * the address parameter for @addr, with room reserved for @vparam_len
 * further bytes.  asoc->addip_serial is post-incremented.  Returns the
 * chunk, or NULL if the address could not be converted or the chunk
 * could not be allocated.
 */
static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
					   union sctp_addr *addr,
					   int vparam_len)
{
	struct sctp_addiphdr asconf;
	struct sctp_chunk *retval;
	int length = sizeof(asconf) + vparam_len;
	union sctp_addr_param addrparam;
	int addrlen;
	struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);

	addrlen = af->to_addr_param(addr, &addrparam);
	if (!addrlen)
		return NULL;
	length += addrlen;

	/* Create the chunk.  */
	retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length,
				   GFP_ATOMIC);
	if (!retval)
		return NULL;

	asconf.serial = htonl(asoc->addip_serial++);

	/* Remember where the ADDIP header and the address parameter were
	 * placed inside the chunk.
	 */
	retval->subh.addip_hdr =
		sctp_addto_chunk(retval, sizeof(asconf), &asconf);
	retval->param_hdr.v =
		sctp_addto_chunk(retval, addrlen, &addrparam);

	return retval;
}

/* ADDIP
 * 3.2.1 Add IP Address
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type = 0xC001          |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * 3.2.2 Delete IP Address
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type = 0xC002          |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Builds an ASCONF chunk with one parameter of type @flags per entry of
 * @addrs (@addrcnt packed sockaddrs).  When an address deletion is
 * pending on the association and @flags is SCTP_PARAM_ADD_IP, a DEL_IP
 * parameter for that pending address is appended as well.  Returns the
 * chunk or NULL if sctp_make_asconf() failed.
 */
struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
					      union sctp_addr *laddr,
					      struct sockaddr *addrs,
					      int addrcnt, __be16 flags)
{
	union sctp_addr_param addr_param;
	struct sctp_addip_param	param;
	int paramlen = sizeof(param);
	struct sctp_chunk *retval;
	int addr_param_len = 0;
	union sctp_addr *addr;
	int totallen = 0, i;
	int del_pickup = 0;
	struct sctp_af *af;
	void *addr_buf;

	/* Get total length of all the address parameters. */
	addr_buf = addrs;
	for (i = 0; i < addrcnt; i++) {
		addr = addr_buf;
		af = sctp_get_af_specific(addr->v4.sin_family);
		addr_param_len = af->to_addr_param(addr, &addr_param);

		totallen += paramlen;
		totallen += addr_param_len;

		/* Entries are packed back to back; step by this family's
		 * sockaddr size.
		 */
		addr_buf += af->sockaddr_len;
		if (asoc->asconf_addr_del_pending && !del_pickup) {
			/* reuse the parameter length from the same scope one */
			totallen += paramlen;
			totallen += addr_param_len;
			del_pickup = 1;

			pr_debug("%s: picked same-scope del_pending addr, "
				 "totallen for all addresses is %d\n",
				 __func__, totallen);
		}
	}

	/* Create an asconf chunk with the required length. */
	retval = sctp_make_asconf(asoc, laddr, totallen);
	if (!retval)
		return NULL;

	/* Add the address parameters to the asconf chunk. */
	addr_buf = addrs;
	for (i = 0; i < addrcnt; i++) {
		addr = addr_buf;
		af = sctp_get_af_specific(addr->v4.sin_family);
		addr_param_len = af->to_addr_param(addr, &addr_param);
		param.param_hdr.type = flags;
		param.param_hdr.length = htons(paramlen + addr_param_len);
		/* The loop index doubles as the correlation id. */
		param.crr_id = htonl(i);

		sctp_addto_chunk(retval, paramlen, &param);
		sctp_addto_chunk(retval, addr_param_len, &addr_param);

		addr_buf += af->sockaddr_len;
	}
	/* Piggy-back the pending deletion on an ADD_IP request; i still
	 * holds addrcnt here and is used as its correlation id.
	 */
	if (flags == SCTP_PARAM_ADD_IP && del_pickup) {
		addr = asoc->asconf_addr_del_pending;
		af = sctp_get_af_specific(addr->v4.sin_family);
		addr_param_len = af->to_addr_param(addr, &addr_param);
		param.param_hdr.type = SCTP_PARAM_DEL_IP;
		param.param_hdr.length = htons(paramlen + addr_param_len);
		param.crr_id = htonl(i);

		sctp_addto_chunk(retval, paramlen, &param);
		sctp_addto_chunk(retval, addr_param_len, &addr_param);
	}
	return retval;
}

/* ADDIP
 * 3.2.4 Set Primary IP Address
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type =0xC004           |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Create an ASCONF chunk with Set Primary IP address parameter.
 */
struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc,
					     union sctp_addr *addr)
{
	struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);
	union sctp_addr_param addrparam;
	struct sctp_addip_param	param;
	struct sctp_chunk *retval;
	int len = sizeof(param);
	int addrlen;

	addrlen = af->to_addr_param(addr, &addrparam);
	if (!addrlen)
		return NULL;
	len += addrlen;

	/* Create the chunk and make asconf header.
*/ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), &param); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. 
*/
/* Append the response for one ASCONF request to @chunk.
 *
 * @crr_id is copied into the response unchanged.  With @err_code equal to
 * SCTP_ERROR_NO_ERROR a bare Success Indication is written.  Otherwise an
 * Error Cause Indication is written, followed by an error header carrying
 * @err_code and, when @asconf_param is not NULL, a copy of the request TLV
 * that failed.
 */
static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id,
				     __be16 err_code,
				     struct sctp_addip_param *asconf_param)
{
	const bool failed = err_code != SCTP_ERROR_NO_ERROR;
	struct sctp_addip_param ack_param;
	struct sctp_errhdr err_param;
	int cause_len = 0;
	int tlv_len = 0;

	/* Only a failure response carries anything after the indication. */
	if (failed) {
		cause_len = sizeof(err_param);
		if (asconf_param)
			tlv_len = ntohs(asconf_param->param_hdr.length);
	}

	/* Success Indication or Error Cause Indication parameter. */
	ack_param.param_hdr.type = failed ? SCTP_PARAM_ERR_CAUSE :
					    SCTP_PARAM_SUCCESS_REPORT;
	ack_param.param_hdr.length = htons(sizeof(ack_param) + cause_len +
					   tlv_len);
	ack_param.crr_id = crr_id;
	sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param);

	if (!failed)
		return;

	/* Error cause header. */
	err_param.cause = err_code;
	err_param.length = htons(cause_len + tlv_len);
	sctp_addto_chunk(chunk, cause_len, &err_param);

	/* The offending TLV, copied from the ASCONF chunk. */
	if (asconf_param)
		sctp_addto_chunk(chunk, tlv_len, asconf_param);
}

/* Process a asconf parameter.
*/
/* Validate and apply a single ADD_IP, DEL_IP or SET_PRIMARY request.
 *
 * @asoc:         association the request applies to
 * @asconf:       the received ASCONF chunk (its source address, transport
 *                and skb are consulted)
 * @asconf_param: the request TLV; its address parameter is read from
 *                directly behind the fixed header
 *
 * Returns SCTP_ERROR_NO_ERROR when the request was applied, otherwise the
 * error cause to report for it.
 */
static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
					struct sctp_chunk *asconf,
					struct sctp_addip_param *asconf_param)
{
	union sctp_addr_param *addr_param;
	struct sctp_transport *peer;
	union sctp_addr addr;
	struct sctp_af *af;

	/* The address parameter follows the fixed request header. */
	addr_param = (void *)asconf_param + sizeof(*asconf_param);

	if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP &&
	    asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP &&
	    asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY)
		return SCTP_ERROR_UNKNOWN_PARAM;

	/* Only IPv4/IPv6 address parameters are accepted, and only when the
	 * matching asoc->peer flag is set.
	 */
	switch (addr_param->p.type) {
	case SCTP_PARAM_IPV6_ADDRESS:
		if (!asoc->peer.ipv6_address)
			return SCTP_ERROR_DNS_FAILED;
		break;
	case SCTP_PARAM_IPV4_ADDRESS:
		if (!asoc->peer.ipv4_address)
			return SCTP_ERROR_DNS_FAILED;
		break;
	default:
		return SCTP_ERROR_DNS_FAILED;
	}

	af = sctp_get_af_specific(param_type2af(addr_param->p.type));
	if (unlikely(!af))
		return SCTP_ERROR_DNS_FAILED;

	if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0))
		return SCTP_ERROR_DNS_FAILED;

	/* ADDIP 4.2.1  This parameter MUST NOT contain a broadcast
	 * or multicast address.
	 * (note: wildcard is permitted and requires special handling so
	 *  make sure we check for that)
	 */
	if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb))
		return SCTP_ERROR_DNS_FAILED;

	switch (asconf_param->param_hdr.type) {
	case SCTP_PARAM_ADD_IP:
		/* Section 4.2.1:
		 * If the address 0.0.0.0 or ::0 is provided, the source
		 * address of the packet MUST be added.
		 */
		if (af->is_any(&addr))
			memcpy(&addr, &asconf->source, sizeof(addr));

		if (security_sctp_bind_connect(asoc->ep->base.sk,
					       SCTP_PARAM_ADD_IP,
					       (struct sockaddr *)&addr,
					       af->sockaddr_len))
			return SCTP_ERROR_REQ_REFUSED;

		/* ADDIP 4.3 D9) If an endpoint receives an ADD IP address
		 * request and does not have the local resources to add this
		 * new address to the association, it MUST return an Error
		 * Cause TLV set to the new error code 'Operation Refused
		 * Due to Resource Shortage'.
		 */

		peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED);
		if (!peer)
			return SCTP_ERROR_RSRC_LOW;

		/* Start the heartbeat timer. */
		sctp_transport_reset_hb_timer(peer);
		asoc->new_transport = peer;
		break;
	case SCTP_PARAM_DEL_IP:
		/* ADDIP 4.3 D7) If a request is received to delete the
		 * last remaining IP address of a peer endpoint, the receiver
		 * MUST send an Error Cause TLV with the error cause set to the
		 * new error code 'Request to Delete Last Remaining IP Address'.
		 */
		if (asoc->peer.transport_count == 1)
			return SCTP_ERROR_DEL_LAST_IP;

		/* ADDIP 4.3 D8) If a request is received to delete an IP
		 * address which is also the source address of the IP packet
		 * which contained the ASCONF chunk, the receiver MUST reject
		 * this request. To reject the request the receiver MUST send
		 * an Error Cause TLV set to the new error code 'Request to
		 * Delete Source IP Address'
		 */
		if (sctp_cmp_addr_exact(&asconf->source, &addr))
			return SCTP_ERROR_DEL_SRC_IP;

		/* Section 4.2.2
		 * If the address 0.0.0.0 or ::0 is provided, all
		 * addresses of the peer except	the source address of the
		 * packet MUST be deleted.
		 */
		if (af->is_any(&addr)) {
			sctp_assoc_set_primary(asoc, asconf->transport);
			sctp_assoc_del_nonprimary_peers(asoc,
							asconf->transport);
			return SCTP_ERROR_NO_ERROR;
		}

		/* If the address is not part of the association, the
		 * ASCONF-ACK with Error Cause Indication Parameter
		 * which including cause of Unresolvable Address should
		 * be sent.
		 */
		peer = sctp_assoc_lookup_paddr(asoc, &addr);
		if (!peer)
			return SCTP_ERROR_DNS_FAILED;

		sctp_assoc_rm_peer(asoc, peer);
		break;
	case SCTP_PARAM_SET_PRIMARY:
		/* ADDIP Section 4.2.4
		 * If the address 0.0.0.0 or ::0 is provided, the receiver
		 * MAY mark the source address of the packet as its
		 * primary.
		 */
		if (af->is_any(&addr))
			memcpy(&addr, sctp_source(asconf), sizeof(addr));

		if (security_sctp_bind_connect(asoc->ep->base.sk,
					       SCTP_PARAM_SET_PRIMARY,
					       (struct sockaddr *)&addr,
					       af->sockaddr_len))
			return SCTP_ERROR_REQ_REFUSED;

		/* The new primary must already be a known peer address. */
		peer = sctp_assoc_lookup_paddr(asoc, &addr);
		if (!peer)
			return SCTP_ERROR_DNS_FAILED;

		sctp_assoc_set_primary(asoc, peer);
		break;
	}

	return SCTP_ERROR_NO_ERROR;
}

/* Verify the ASCONF packet before we process it.
 *
 * @asoc:              not referenced by the body
 * @chunk:             chunk whose parameters are walked
 * @addr_param_needed: whether a leading address parameter is required
 *                     (true) or forbidden (false)
 * @errp:              set to the parameter under examination on every
 *                     iteration, so on a false return it points at the
 *                     parameter that was being checked
 *
 * Returns true only if every parameter passes its length/position check,
 * the presence of an address parameter matches @addr_param_needed, and
 * the walk ends exactly at chunk->chunk_end.
 */
bool sctp_verify_asconf(const struct sctp_association *asoc,
			struct sctp_chunk *chunk, bool addr_param_needed,
			struct sctp_paramhdr **errp)
{
	struct sctp_addip_chunk *addip;
	bool addr_param_seen = false;
	union sctp_params param;

	addip = (struct sctp_addip_chunk *)chunk->chunk_hdr;
	sctp_walk_params(param, addip) {
		size_t length = ntohs(param.p->length);

		*errp = param.p;
		switch (param.p->type) {
		case SCTP_PARAM_ERR_CAUSE:
			break;
		case SCTP_PARAM_IPV4_ADDRESS:
			if (length != sizeof(struct sctp_ipv4addr_param))
				return false;
			/* ensure there is only one addr param and it's in the
			 * beginning of addip_hdr params, or we reject it.
			 */
			if (param.v != (addip + 1))
				return false;
			addr_param_seen = true;
			break;
		case SCTP_PARAM_IPV6_ADDRESS:
			if (length != sizeof(struct sctp_ipv6addr_param))
				return false;
			/* Same position rule as for the IPv4 case. */
			if (param.v != (addip + 1))
				return false;
			addr_param_seen = true;
			break;
		case SCTP_PARAM_ADD_IP:
		case SCTP_PARAM_DEL_IP:
		case SCTP_PARAM_SET_PRIMARY:
			/* In ASCONF chunks the address parameter must come
			 * before any of these requests.
			 */
			if (addr_param_needed && !addr_param_seen)
				return false;
			/* A request must be large enough to hold its fixed
			 * header plus at least one parameter header.
			 */
			length = ntohs(param.addip->param_hdr.length);
			if (length < sizeof(struct sctp_addip_param) +
				     sizeof(**errp))
				return false;
			break;
		case SCTP_PARAM_SUCCESS_REPORT:
		case SCTP_PARAM_ADAPTATION_LAYER_IND:
			if (length != sizeof(struct sctp_addip_param))
				return false;
			break;
		default:
			/* This is unknown to us, reject! */
			return false;
		}
	}

	/* Remaining sanity checks. */
	if (addr_param_needed && !addr_param_seen)
		return false;
	if (!addr_param_needed && addr_param_seen)
		return false;
	/* The walk must have consumed the chunk exactly. */
	if (param.v != chunk->chunk_end)
		return false;

	return true;
}

/* Process an incoming ASCONF chunk with the next expected serial no. and
 * return an ASCONF_ACK chunk to be sent in response.
 *
 * asoc->peer.addip_serial is incremented on every call, including when the
 * ASCONF_ACK could not be created; in that case NULL is returned.  A
 * created ASCONF_ACK gets an extra reference and is queued on
 * asoc->asconf_ack_list before being returned.
 */
struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
				       struct sctp_chunk *asconf)
{
	union sctp_addr_param *addr_param;
	struct sctp_addip_chunk *addip;
	struct sctp_chunk *asconf_ack;
	bool all_param_pass = true;
	struct sctp_addiphdr *hdr;
	int length = 0, chunk_len;
	union sctp_params param;
	__be16 err_code;
	__u32 serial;

	addip = (struct sctp_addip_chunk *)asconf->chunk_hdr;
	chunk_len = ntohs(asconf->chunk_hdr->length) -
		    sizeof(struct sctp_chunkhdr);
	hdr = (struct sctp_addiphdr *)asconf->skb->data;
	serial = ntohl(hdr->serial);

	/* Skip the addiphdr and store a pointer to address parameter.  */
	length = sizeof(*hdr);
	addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
	chunk_len -= length;

	/* Skip the address parameter as well; from here on only the length
	 * of the remaining request parameters is needed.
	 */
	length = ntohs(addr_param->p.length);
	chunk_len -= length;

	/* create an ASCONF_ACK chunk.
	 * Based on the definitions of parameters, we know that the size of
	 * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF
	 * parameters.
	 */
	asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4);
	if (!asconf_ack)
		goto done;

	/* Process the TLVs contained within the ASCONF chunk. */
	sctp_walk_params(param, addip) {
		/* Skip preceding address parameters. */
		if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
		    param.p->type == SCTP_PARAM_IPV6_ADDRESS)
			continue;

		err_code = sctp_process_asconf_param(asoc, asconf,
						     param.addip);
		/* ADDIP 4.1 A7)
		 * If an error response is received for a TLV parameter,
		 * all TLVs with no response before the failed TLV are
		 * considered successful if not reported.  All TLVs after
		 * the failed response are considered unsuccessful unless
		 * a specific success indication is present for the parameter.
		 */
		if (err_code != SCTP_ERROR_NO_ERROR)
			all_param_pass = false;
		/* Once one request has failed, every later request gets an
		 * explicit response, successful or not.
		 */
		if (!all_param_pass)
			sctp_add_asconf_response(asconf_ack, param.addip->crr_id,
						 err_code, param.addip);

		/* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add
		 * an IP address sends an 'Out of Resource' in its response, it
		 * MUST also fail any subsequent add or delete requests bundled
		 * in the ASCONF.
		 */
		if (err_code == SCTP_ERROR_RSRC_LOW)
			goto done;
	}
done:
	asoc->peer.addip_serial++;

	/* If we are sending a new ASCONF_ACK, hold a reference to it and
	 * queue it on the association's asconf_ack_list.
	 */
	if (asconf_ack) {
		sctp_chunk_hold(asconf_ack);
		list_add_tail(&asconf_ack->transmitted_list,
			      &asoc->asconf_ack_list);
	}

	return asconf_ack;
}

/* Process a asconf parameter that is successfully acked.
 *
 * The address is rebuilt from the TLV that was sent, using the port of the
 * association's bind address.  For ADD_IP every matching entry in the bind
 * address list is moved to state SCTP_ADDR_SRC; for DEL_IP the address is
 * removed from the bind address list and a matching
 * asoc->asconf_addr_del_pending is freed and cleared.  In both cases
 * sctp_transport_dst_release() is then called on every peer transport.
 * Other request types are ignored.
 */
static void sctp_asconf_param_success(struct sctp_association *asoc,
				      struct sctp_addip_param *asconf_param)
{
	struct sctp_bind_addr *bp = &asoc->base.bind_addr;
	union sctp_addr_param *addr_param;
	struct sctp_sockaddr_entry *saddr;
	struct sctp_transport *transport;
	union sctp_addr	addr;
	struct sctp_af *af;

	addr_param = (void *)asconf_param + sizeof(*asconf_param);

	/* We have checked the packet before, so we do not check again.	*/
	af = sctp_get_af_specific(param_type2af(addr_param->p.type));
	if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0))
		return;

	switch (asconf_param->param_hdr.type) {
	case SCTP_PARAM_ADD_IP:
		/* This is always done in BH context with a socket lock
		 * held, so the list can not change.
		 */
		local_bh_disable();
		list_for_each_entry(saddr, &bp->address_list, list) {
			if (sctp_cmp_addr_exact(&saddr->a, &addr))
				saddr->state = SCTP_ADDR_SRC;
		}
		local_bh_enable();
		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
				transports) {
			sctp_transport_dst_release(transport);
		}
		break;
	case SCTP_PARAM_DEL_IP:
		local_bh_disable();
		sctp_del_bind_addr(bp, &addr);
		/* This ack may be the one the pending delete was waiting on. */
		if (asoc->asconf_addr_del_pending != NULL &&
		    sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) {
			kfree(asoc->asconf_addr_del_pending);
			asoc->asconf_addr_del_pending = NULL;
		}
		local_bh_enable();
		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
				transports) {
			sctp_transport_dst_release(transport);
		}
		break;
	default:
		break;
	}
}

/* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk
 * for the given asconf parameter.  If there is no response for this parameter,
 * return the error code based on the third argument 'no_err'.
 * ADDIP 4.1
 * A7) If an error response is received for a TLV parameter, all TLVs with no
 * response before the failed TLV are considered successful if not reported.
 * All TLVs after the failed response are considered unsuccessful unless a
 * specific success indication is present for the parameter.
 *
 * Responses are matched on the correlation id.  A matching response of a
 * type other than Success Report or Error Cause, or an Error Cause with no
 * bytes left behind its indication header, yields SCTP_ERROR_INV_PARAM.
 * Without a match the result is SCTP_ERROR_NO_ERROR if @no_err is non-zero
 * and SCTP_ERROR_REQ_REFUSED otherwise.
 */
static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack,
				       struct sctp_addip_param *asconf_param,
				       int no_err)
{
	struct sctp_addip_param	*asconf_ack_param;
	struct sctp_errhdr *err_param;
	int asconf_ack_len;
	__be16 err_code;
	int length;

	if (no_err)
		err_code = SCTP_ERROR_NO_ERROR;
	else
		err_code = SCTP_ERROR_REQ_REFUSED;

	asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) -
			 sizeof(struct sctp_chunkhdr);

	/* Skip the addiphdr from the asconf_ack chunk and store a pointer to
	 * the first asconf_ack parameter.
	 */
	length = sizeof(struct sctp_addiphdr);
	asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data +
						       length);
	asconf_ack_len -= length;

	while (asconf_ack_len > 0) {
		if (asconf_ack_param->crr_id == asconf_param->crr_id) {
			switch (asconf_ack_param->param_hdr.type) {
			case SCTP_PARAM_SUCCESS_REPORT:
				return SCTP_ERROR_NO_ERROR;
			case SCTP_PARAM_ERR_CAUSE:
				/* The error header sits right behind the
				 * indication header.
				 */
				length = sizeof(*asconf_ack_param);
				err_param = (void *)asconf_ack_param + length;
				asconf_ack_len -= length;
				if (asconf_ack_len > 0)
					return err_param->cause;
				else
					return SCTP_ERROR_INV_PARAM;
				/* Not reached: both branches above return. */
				break;
			default:
				return SCTP_ERROR_INV_PARAM;
			}
		}

		/* No match: step over this response. */
		length = ntohs(asconf_ack_param->param_hdr.length);
		asconf_ack_param = (void *)asconf_ack_param + length;
		asconf_ack_len -= length;
	}

	return err_code;
}

/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk.
 *
 * Every request in asoc->addip_last_asconf is resolved to an error code and
 * acted upon; afterwards the cached chunk is unlinked, freed and
 * asoc->addip_last_asconf is cleared.
 *
 * Returns 1 if any request was answered with SCTP_ERROR_RSRC_LOW,
 * otherwise 0.
 */
int sctp_process_asconf_ack(struct sctp_association *asoc,
			    struct sctp_chunk *asconf_ack)
{
	struct sctp_chunk *asconf = asoc->addip_last_asconf;
	struct sctp_addip_param *asconf_param;
	__be16 err_code = SCTP_ERROR_NO_ERROR;
	union sctp_addr_param *addr_param;
	int asconf_len = asconf->skb->len;
	int all_param_pass = 0;
	int length = 0;
	int no_err = 1;
	int retval = 0;

	/* Skip the chunkhdr and addiphdr from the last asconf sent and store
	 * a pointer to address parameter.
	 */
	length = sizeof(struct sctp_addip_chunk);
	addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
	asconf_len -= length;

	/* Skip the address parameter in the last asconf sent and store a
	 * pointer to the first asconf parameter.
	 */
	length = ntohs(addr_param->p.length);
	asconf_param = (void *)addr_param + length;
	asconf_len -= length;

	/* ADDIP 4.1
	 * A8) If there is no response(s) to specific TLV parameter(s), and no
	 * failures are indicated, then all request(s) are considered
	 * successful.
	 */
	if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr))
		all_param_pass = 1;

	/* Process the TLVs contained in the last sent ASCONF chunk. */
	while (asconf_len > 0) {
		if (all_param_pass)
			err_code = SCTP_ERROR_NO_ERROR;
		else {
			err_code = sctp_get_asconf_response(asconf_ack,
							    asconf_param,
							    no_err);
			/* After the first failure, requests without a
			 * response are treated as refused.
			 */
			if (no_err && (SCTP_ERROR_NO_ERROR != err_code))
				no_err = 0;
		}

		switch (err_code) {
		case SCTP_ERROR_NO_ERROR:
			sctp_asconf_param_success(asoc, asconf_param);
			break;

		case SCTP_ERROR_RSRC_LOW:
			retval = 1;
			break;

		case SCTP_ERROR_UNKNOWN_PARAM:
			/* Disable sending this type of asconf parameter in
			 * future.
			 */
			asoc->peer.addip_disabled_mask |=
				asconf_param->param_hdr.type;
			break;

		case SCTP_ERROR_REQ_REFUSED:
		case SCTP_ERROR_DEL_LAST_IP:
		case SCTP_ERROR_DEL_SRC_IP:
		default:
			 break;
		}

		/* Skip the processed asconf parameter and move to the next
		 * one.
		 */
		length = ntohs(asconf_param->param_hdr.length);
		asconf_param = (void *)asconf_param + length;
		asconf_len -= length;
	}

	/* Everything succeeded: clear src_out_of_asoc_ok if it was set and
	 * trigger an immediate retransmit on the primary path.
	 */
	if (no_err && asoc->src_out_of_asoc_ok) {
		asoc->src_out_of_asoc_ok = 0;
		sctp_transport_immediate_rtx(asoc->peer.primary_path);
	}

	/* Free the cached last sent asconf chunk. */
	list_del_init(&asconf->transmitted_list);
	sctp_chunk_free(asconf);
	asoc->addip_last_asconf = NULL;

	return retval;
}

/* Make a FWD TSN chunk.
*/ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, 
SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = (u8 *)(reconf + 1); return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... 
/ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = 
sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct 
sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; }
2805 2963 2948 15 1247 457 1514 166 1350 309 845 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix - mix 3 32-bit values reversibly. 
*/ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += get_unaligned((u32 *)k); b += get_unaligned((u32 *)(k + 4)); c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } 
return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
4452 4452 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct 
wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void 
device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); }
1 18 1481 168 126 172 13 53 5 494 40 494 20691 1057 27 518 495 493 13 331 8 8 8 243 242 243 857 837 21 838 9 9 533 24 28 28 334 869 76 74 54 63 124 24548 9 89 284 100 99 44 4 101 101 715 717 718 46 78 528 240 58 27 84 29 197 115 71 178 7 171 4 26 266 316 217 533 675 41 675 835 289 311 19013 221 13750 676 2148 9 222 20632 1 113 113 113 99 229 19407 198 2 27 295 86 3 20 86 315 867 184 35 32 29 57 47 37 2 37 4 240 2 1 4 4 3 6 6 195 284 1515 184 260 441 17 177 177 173 24 1 109 502 3828 657 7 1374 1373 78 78 10 10 10 3301 80 323 36 404 30 122 306 22 19065 56 717 235 10 1636 16731 276 2 806 4 362 4 75 18 65 158 39 25 95 298 1132 13 11 722 3 442 32 916 6 19 417 34 4 14729 14728 8 13 60 50 13 238 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 
852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 
1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 
2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 
2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 
2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. 
Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. 
*/
typedef struct {
	spinlock_t		slock;
	int			owned;
	wait_queue_head_t	wq;
	/*
	 * We express the mutex-alike socket_lock semantics
	 * to the lock validator by explicitly managing
	 * the slock as a lock variant (in addition to
	 * the slock itself):
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} socket_lock_t;

struct sock;
struct proto;
struct net;

typedef __u32 __bitwise __portpair;
typedef __u64 __bitwise __addrpair;

/**
 *	struct sock_common - minimal network layer representation of sockets
 *	@skc_daddr: Foreign IPv4 addr
 *	@skc_rcv_saddr: Bound local IPv4 addr
 *	@skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *	@skc_hash: hash value used with various protocol lookup tables
 *	@skc_u16hashes: two u16 hash values used by UDP lookup tables
 *	@skc_dport: placeholder for inet_dport/tw_dport
 *	@skc_num: placeholder for inet_num/tw_num
 *	@skc_portpair: __u32 union of @skc_dport & @skc_num
 *	@skc_family: network address family
 *	@skc_state: Connection state
 *	@skc_reuse: %SO_REUSEADDR setting
 *	@skc_reuseport: %SO_REUSEPORT setting
 *	@skc_ipv6only: socket is IPV6 only
 *	@skc_net_refcnt: socket is using net ref counting
 *	@skc_bound_dev_if: bound device index if != 0
 *	@skc_bind_node: bind hash linkage for various protocol lookup tables
 *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *	@skc_prot: protocol handlers inside a network family
 *	@skc_net: reference to the network namespace of this socket
 *	@skc_v6_daddr: IPV6 destination address
 *	@skc_v6_rcv_saddr: IPV6 source address
 *	@skc_cookie: socket's cookie value
 *	@skc_node: main hash linkage for various protocol lookup tables
 *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *	@skc_tx_queue_mapping: tx queue number for this connection
 *	@skc_rx_queue_mapping: rx queue number for this connection
 *	@skc_flags: place holder for sk_flags
 *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *	@skc_listener: connection request listener socket (aka rsk_listener)
 *		[union with @skc_flags]
 *	@skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *		[union with @skc_flags]
 *	@skc_incoming_cpu: record/match cpu processing incoming packets
 *	@skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *		[union with @skc_incoming_cpu]
 *	@skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *		[union with @skc_incoming_cpu]
 *	@skc_refcnt: reference count
 *
 *	This is the minimal network layer representation of sockets, the header
 *	for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
	union {
		__addrpair	skc_addrpair;
		struct {
			__be32	skc_daddr;
			__be32	skc_rcv_saddr;
		};
	};
	union {
		unsigned int	skc_hash;
		__u16		skc_u16hashes[2];
	};
	/* skc_dport && skc_num must be grouped as well */
	union {
		__portpair	skc_portpair;
		struct {
			__be16	skc_dport;
			__u16	skc_num;
		};
	};

	unsigned short		skc_family;
	volatile unsigned char	skc_state;
	unsigned char		skc_reuse:4;
	unsigned char		skc_reuseport:1;
	unsigned char		skc_ipv6only:1;
	unsigned char		skc_net_refcnt:1;
	int			skc_bound_dev_if;
	union {
		struct hlist_node	skc_bind_node;
		struct hlist_node	skc_portaddr_node;
	};
	struct proto		*skc_prot;
	possible_net_t		skc_net;

#if IS_ENABLED(CONFIG_IPV6)
	struct in6_addr		skc_v6_daddr;
	struct in6_addr		skc_v6_rcv_saddr;
#endif

	atomic64_t		skc_cookie;

	/* The following fields are padding to force
	 * offset(struct sock, sk_refcnt) == 128 on 64bit arches
	 * assuming IPV6 is enabled. We use this padding differently
	 * for different kinds of 'sockets'.
	 */
	union {
		unsigned long	skc_flags;
		struct sock	*skc_listener; /* request_sock */
		struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
	};
	/*
	 * fields between dontcopy_begin/dontcopy_end
	 * are not copied in sock_copy()
	 */
	/* private: */
	int			skc_dontcopy_begin[0];
	/* public: */
	union {
		struct hlist_node	skc_node;
		struct hlist_nulls_node skc_nulls_node;
	};
	unsigned short		skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	unsigned short		skc_rx_queue_mapping;
#endif
	union {
		int		skc_incoming_cpu;
		u32		skc_rcv_wnd;
		u32		skc_tw_rcv_nxt; /* struct tcp_timewait_sock */
	};

	refcount_t		skc_refcnt;
	/* private: */
	int			skc_dontcopy_end[0];
	union {
		u32		skc_rxhash;
		u32		skc_window_clamp;
		u32		skc_tw_snd_nxt; /* struct tcp_timewait_sock */
	};
	/* public: */
};

struct bpf_local_storage;
struct sk_filter;

/**
 *	struct sock - network layer representation of sockets
 *	@__sk_common: shared layout with inet_timewait_sock
 *	@sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
 *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
 *	@sk_lock:	synchronizer
 *	@sk_kern_sock: True if sock is using kernel lock classes
 *	@sk_rcvbuf: size of receive buffer in bytes
 *	@sk_wq: sock wait queue and async head
 *	@sk_rx_dst: receive input route used by early demux
 *	@sk_rx_dst_ifindex: ifindex for @sk_rx_dst
 *	@sk_rx_dst_cookie: cookie for @sk_rx_dst
 *	@sk_dst_cache: destination cache
 *	@sk_dst_pending_confirm: need to confirm neighbour
 *	@sk_policy: flow policy
 *	@sk_receive_queue: incoming packets
 *	@sk_wmem_alloc: transmit queue bytes committed
 *	@sk_tsq_flags: TCP Small Queues flags
 *	@sk_write_queue: Packet sending queue
 *	@sk_omem_alloc: "o" is "option" or "other"
 *	@sk_wmem_queued: persistent queue size
 *	@sk_forward_alloc: space allocated forward
 *	@sk_reserved_mem: space reserved and non-reclaimable for the socket
 *	@sk_napi_id: id of the last napi context to receive data for sk
 *	@sk_ll_usec: usecs to busypoll when there is no data
 *	@sk_allocation: allocation mode
 *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
 *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
 *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
 *	@sk_sndbuf: size of send buffer in bytes
 *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
 *	@sk_no_check_rx: allow zero checksum in RX packets
 *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
 *	@sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
 *	@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
 *	@sk_gso_max_size: Maximum GSO segment size to build
 *	@sk_gso_max_segs: Maximum number of GSO segments
 *	@sk_pacing_shift: scaling factor for TCP Small Queues
 *	@sk_lingertime: %SO_LINGER l_linger setting
 *	@sk_backlog: always used with the per-socket spinlock held
 *	@sk_callback_lock: used with the callbacks in the end of this struct
 *	@sk_error_queue: rarely used
 *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
 *			  IPV6_ADDRFORM for instance)
 *	@sk_err: last error
 *	@sk_err_soft: errors that don't cause failure but are the cause of a
 *		      persistent failure not just 'timed out'
 *	@sk_drops: raw/udp drops counter
 *	@sk_ack_backlog: current listen backlog
 *	@sk_max_ack_backlog: listen backlog set in listen()
 *	@sk_uid: user id of owner
 *	@sk_ino: inode number (zero if orphaned)
 *	@sk_prefer_busy_poll: prefer busypolling over softirq processing
 *	@sk_busy_poll_budget: napi processing budget when busypolling
 *	@sk_priority: %SO_PRIORITY setting
 *	@sk_type: socket type (%SOCK_STREAM, etc)
 *	@sk_protocol: which protocol this socket belongs in this network family
 *	@sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
 *	@sk_peer_pid: &struct pid for this socket's peer
 *	@sk_peer_cred: %SO_PEERCRED setting
 *	@sk_rcvlowat: %SO_RCVLOWAT setting
 *	@sk_rcvtimeo: %SO_RCVTIMEO setting
 *	@sk_sndtimeo: %SO_SNDTIMEO setting
 *	@sk_txhash: computed flow hash for use on transmit
 *	@sk_txrehash: enable TX hash rethink
 *	@sk_filter: socket filtering instructions
 *	@sk_timer: sock cleanup timer
 *	@sk_stamp: time stamp of last packet received
 *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
 *	@sk_tsflags: SO_TIMESTAMPING flags
 *	@sk_bpf_cb_flags: used in bpf_setsockopt()
 *	@sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
 *			   Sockets that can be used under memory reclaim should
 *			   set this to false.
 *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
 *	              for timestamping
 *	@sk_tskey: counter to disambiguate concurrent tstamp requests
 *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
 *	@sk_socket: Identd and reporting IO signals
 *	@sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
 *	@sk_frag: cached page frag
 *	@sk_peek_off: current peek_offset value
 *	@sk_send_head: front of stuff to transmit
 *	@tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
 *	@sk_security: used by security modules
 *	@sk_mark: generic packet mark
 *	@sk_cgrp_data: cgroup data for this cgroup
 *	@sk_memcg: this socket's memory cgroup association
 *	@sk_write_pending: a write to stream socket waits to start
 *	@sk_disconnects: number of disconnect operations performed on this sock
 *	@sk_state_change: callback to indicate change in the state of the sock
 *	@sk_data_ready: callback to indicate there is data to be processed
 *	@sk_write_space: callback to indicate there is buffer sending space available
 *	@sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
 *	@sk_backlog_rcv: callback to process the backlog
 *	@sk_validate_xmit_skb: ptr to an optional validate function
 *	@sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 *	@sk_reuseport_cb: reuseport group container
 *	@sk_bpf_storage: ptr to cache and control for bpf_sk_storage
 *	@sk_rcu: used during RCU grace period
 *	@sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
 *	@sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
 *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
 *	@sk_txtime_unused: unused txtime flags
 *	@sk_scm_recv_flags: all flags used by scm_recv()
 *	@sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
 *	@sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
 *	@sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
 *	@sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
 *	@sk_scm_unused: unused flags for scm_recv()
 *	@ns_tracker: tracker for netns reference
 *	@sk_user_frags: xarray of pages the user is holding a reference on.
 *	@sk_owner: reference to the real owner of the socket that calls
 *		   sock_lock_init_class_and_name().
 */
struct sock {
	/*
	 * struct inet_timewait_sock also uses sock_common, so please
	 * do not add anything before this first member (__sk_common) --acme
	 */
	struct sock_common	__sk_common;
#define sk_node			__sk_common.skc_node
#define sk_nulls_node		__sk_common.skc_nulls_node
#define sk_refcnt		__sk_common.skc_refcnt
#define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
#define sk_dontcopy_end		__sk_common.skc_dontcopy_end
#define sk_hash			__sk_common.skc_hash
#define sk_portpair		__sk_common.skc_portpair
#define sk_num			__sk_common.skc_num
#define sk_dport		__sk_common.skc_dport
#define sk_addrpair		__sk_common.skc_addrpair
#define sk_daddr		__sk_common.skc_daddr
#define sk_rcv_saddr		__sk_common.skc_rcv_saddr
#define sk_family		__sk_common.skc_family
#define sk_state		__sk_common.skc_state
#define sk_reuse		__sk_common.skc_reuse
#define sk_reuseport		__sk_common.skc_reuseport
#define sk_ipv6only		__sk_common.skc_ipv6only
#define sk_net_refcnt		__sk_common.skc_net_refcnt
#define sk_bound_dev_if		__sk_common.skc_bound_dev_if
#define sk_bind_node		__sk_common.skc_bind_node
#define sk_prot			__sk_common.skc_prot
#define sk_net			__sk_common.skc_net
#define sk_v6_daddr		__sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
#define sk_cookie		__sk_common.skc_cookie
#define sk_incoming_cpu		__sk_common.skc_incoming_cpu
#define sk_flags		__sk_common.skc_flags
#define sk_rxhash		__sk_common.skc_rxhash

	__cacheline_group_begin(sock_write_rx);

	atomic_t		sk_drops;
	__s32			sk_peek_off;
	struct sk_buff_head	sk_error_queue;
	struct sk_buff_head	sk_receive_queue;
	/*
	 * The backlog queue is special, it is always used with
	 * the per-socket spinlock held and requires low latency
	 * access. Therefore we special-case its implementation.
	 * Note : rmem_alloc is in this structure to fill a hole
	 * on 64bit arches, not because it is logically part of
	 * backlog.
	 */
	struct {
		atomic_t	rmem_alloc;
		int		len;
		struct sk_buff	*head;
		struct sk_buff	*tail;
	} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc

	__cacheline_group_end(sock_write_rx);

	__cacheline_group_begin(sock_read_rx);
	/* early demux fields */
	struct dst_entry __rcu	*sk_rx_dst;
	int			sk_rx_dst_ifindex;
	u32			sk_rx_dst_cookie;

#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned int		sk_ll_usec;
	unsigned int		sk_napi_id;
	u16			sk_busy_poll_budget;
	u8			sk_prefer_busy_poll;
#endif
	u8			sk_userlocks;
	int			sk_rcvbuf;

	struct sk_filter __rcu	*sk_filter;
	union {
		struct socket_wq __rcu	*sk_wq;
		/* private: */
		struct socket_wq	*sk_wq_raw;
		/* public: */
	};

	void			(*sk_data_ready)(struct sock *sk);
	long			sk_rcvtimeo;
	int			sk_rcvlowat;
	__cacheline_group_end(sock_read_rx);

	__cacheline_group_begin(sock_read_rxtx);
	int			sk_err;
	struct socket		*sk_socket;
	struct mem_cgroup	*sk_memcg;
#ifdef CONFIG_XFRM
	struct xfrm_policy __rcu *sk_policy[2];
#endif
	__cacheline_group_end(sock_read_rxtx);

	__cacheline_group_begin(sock_write_rxtx);
	socket_lock_t		sk_lock;
	u32			sk_reserved_mem;
	int			sk_forward_alloc;
	u32			sk_tsflags;
	__cacheline_group_end(sock_write_rxtx);

	__cacheline_group_begin(sock_write_tx);
	int			sk_write_pending;
	atomic_t		sk_omem_alloc;
	int			sk_sndbuf;

	int			sk_wmem_queued;
	refcount_t		sk_wmem_alloc;
	unsigned long		sk_tsq_flags;
	union {
		struct sk_buff	*sk_send_head;
		struct rb_root	tcp_rtx_queue;
	};
	struct sk_buff_head	sk_write_queue;
	u32			sk_dst_pending_confirm;
	u32			sk_pacing_status; /* see enum sk_pacing */
	struct page_frag	sk_frag;
	struct timer_list	sk_timer;

	unsigned long		sk_pacing_rate; /* bytes per second */
	atomic_t		sk_zckey;
	atomic_t		sk_tskey;
	__cacheline_group_end(sock_write_tx);

	__cacheline_group_begin(sock_read_tx);
	unsigned long		sk_max_pacing_rate;
	long			sk_sndtimeo;
	u32			sk_priority;
	u32			sk_mark;
	struct dst_entry __rcu	*sk_dst_cache;
	netdev_features_t	sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
	struct sk_buff*		(*sk_validate_xmit_skb)(struct sock *sk,
							struct net_device *dev,
							struct sk_buff *skb);
#endif
	u16			sk_gso_type;
	u16			sk_gso_max_segs;
	unsigned int		sk_gso_max_size;
	gfp_t			sk_allocation;
	u32			sk_txhash;
	u8			sk_pacing_shift;
	bool			sk_use_task_frag;
	__cacheline_group_end(sock_read_tx);

	/*
	 * Because of non atomicity rules, all
	 * changes are protected by socket lock.
	 */
	u8			sk_gso_disabled : 1,
				sk_kern_sock : 1,
				sk_no_check_tx : 1,
				sk_no_check_rx : 1;
	u8			sk_shutdown;
	u16			sk_type;
	u16			sk_protocol;
	unsigned long		sk_lingertime;
	struct proto		*sk_prot_creator;
	rwlock_t		sk_callback_lock;
	int			sk_err_soft;
	u32			sk_ack_backlog;
	u32			sk_max_ack_backlog;
	kuid_t			sk_uid;
	unsigned long		sk_ino;
	spinlock_t		sk_peer_lock;
	int			sk_bind_phc;
	struct pid		*sk_peer_pid;
	const struct cred	*sk_peer_cred;

	ktime_t			sk_stamp;
#if BITS_PER_LONG==32
	seqlock_t		sk_stamp_seq;
#endif
	int			sk_disconnects;

	union {
		u8		sk_txrehash;
		u8		sk_scm_recv_flags;
		struct {
			u8	sk_scm_credentials : 1,
				sk_scm_security : 1,
				sk_scm_pidfd : 1,
				sk_scm_rights : 1,
				sk_scm_unused : 4;
		};
	};
	u8			sk_clockid;
	u8			sk_txtime_deadline_mode : 1,
				sk_txtime_report_errors : 1,
				sk_txtime_unused : 6;
#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
	u8			sk_bpf_cb_flags;

	void			*sk_user_data;
#ifdef CONFIG_SECURITY
	void			*sk_security;
#endif
	struct sock_cgroup_data	sk_cgrp_data;
	void			(*sk_state_change)(struct sock *sk);
	void			(*sk_write_space)(struct sock *sk);
	void			(*sk_error_report)(struct sock *sk);
	int			(*sk_backlog_rcv)(struct sock *sk,
						  struct sk_buff *skb);
	void			(*sk_destruct)(struct sock *sk);
	struct sock_reuseport __rcu	*sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
	struct bpf_local_storage __rcu	*sk_bpf_storage;
#endif
	struct rcu_head		sk_rcu;
	netns_tracker		ns_tracker;
	struct xarray		sk_user_frags;

#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
	struct module		*sk_owner;
#endif
};

struct sock_bh_locked {
	struct sock *sock;
	local_lock_t bh_lock;
};

/* Values stored in sock::sk_pacing_status (see the field's comment). */
enum sk_pacing {
	SK_PACING_NONE		= 0,
	SK_PACING_NEEDED	= 1,
	SK_PACING_FQ		= 2,
};

/* flag bits in sk_user_data
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data
might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. */ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. 
Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. 
*/ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? 
hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. 
*/ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { 
hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) #define sk_for_each_bound_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. 
*/ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) /* * The highest bit of sk_tsflags is reserved for kernel-internal * SOCKCM_FLAG_TS_OPT_ID. 
There is a check in core/sock.c to control that * SOF_TIMESTAMPING* values do not reach this reserved area */ #define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. 
*/
/* Returns half of the bytes currently accounted in sk_wmem_queued. */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
	return READ_ONCE(sk->sk_wmem_queued) >> 1;
}

/* Returns sk_sndbuf minus sk_wmem_queued; both are read with READ_ONCE(). */
static inline int sk_stream_wspace(const struct sock *sk)
{
	return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
}

/* Add @val to sk_wmem_queued; the store is done with WRITE_ONCE(). */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

/* Add @val to sk_forward_alloc. */
static inline void sk_forward_alloc_add(struct sock *sk, int val)
{
	/* Paired with lockless reads of sk->sk_forward_alloc */
	WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
}

void sk_stream_write_space(struct sock *sk);

/*
 * OOB backlog add: append @skb to the singly linked sk_backlog list
 * (head/tail pointers, chained through skb->next).
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	/* don't leave the skb dst un-refcounted, we are going to leave rcu lock */
	skb_dst_force(skb);

	/* empty list: @skb becomes the head, otherwise chain it after the tail */
	if (!sk->sk_backlog.tail)
		WRITE_ONCE(sk->sk_backlog.head, skb);
	else
		sk->sk_backlog.tail->next = skb;

	WRITE_ONCE(sk->sk_backlog.tail, skb);
	skb->next = NULL;
}

/*
 * Take into account size of receive queue and backlog queue.
 * Do not take into account this skb truesize,
 * to allow even a single big packet to come.
 *
 * Returns true when sk_backlog.len + sk_rmem_alloc exceeds @limit.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

	return qsize > limit;
}

/* The per-socket spinlock must be held here.
*/ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? 
__condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } struct proto_accept_arg { int flags; int err; int is_empty; bool kern; }; /* Networking protocol blocks we attach to sockets. 
* socket layer -> transport layer interface
 *
 * Holds the per-protocol operation callbacks, memory-accounting state
 * and slab parameters. Helpers in this header reach it through
 * sk->sk_prot.
 */
struct proto {
	void			(*close)(struct sock *sk,
					long timeout);
	int			(*pre_connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
	int			(*connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
	int			(*disconnect)(struct sock *sk, int flags);

	struct sock *		(*accept)(struct sock *sk,
					  struct proto_accept_arg *arg);

	int			(*ioctl)(struct sock *sk, int cmd,
					 int *karg);
	int			(*init)(struct sock *sk);
	void			(*destroy)(struct sock *sk);
	void			(*shutdown)(struct sock *sk, int how);
	int			(*setsockopt)(struct sock *sk, int level,
					int optname, sockptr_t optval,
					unsigned int optlen);
	int			(*getsockopt)(struct sock *sk, int level,
					int optname, char __user *optval,
					int __user *option);
	void			(*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
	int			(*compat_ioctl)(struct sock *sk,
					unsigned int cmd, unsigned long arg);
#endif
	int			(*sendmsg)(struct sock *sk, struct msghdr *msg,
					   size_t len);
	int			(*recvmsg)(struct sock *sk, struct msghdr *msg,
					   size_t len, int flags, int *addr_len);
	void			(*splice_eof)(struct socket *sock);
	int			(*bind)(struct sock *sk,
					struct sockaddr *addr, int addr_len);
	int			(*bind_add)(struct sock *sk,
					struct sockaddr *addr, int addr_len);

	int			(*backlog_rcv) (struct sock *sk,
						struct sk_buff *skb);
	bool			(*bpf_bypass_getsockopt)(int level,
							 int optname);

	void		(*release_cb)(struct sock *sk);

	/* Keeping track of sk's, looking them up, and port selection methods.
	 */
	int			(*hash)(struct sock *sk);
	void			(*unhash)(struct sock *sk);
	void			(*rehash)(struct sock *sk);
	int			(*get_port)(struct sock *sk, unsigned short snum);
	void			(*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
	int			(*psock_update_sk_prot)(struct sock *sk,
							struct sk_psock *psock,
							bool restore);
#endif

	/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
	unsigned int		inuse_idx;
#endif

	bool			(*stream_memory_free)(const struct sock *sk, int wake);
	bool			(*sock_is_readable)(struct sock *sk);
	/* Memory pressure */
	void			(*enter_memory_pressure)(struct sock *sk);
	void			(*leave_memory_pressure)(struct sock *sk);
	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
	int  __percpu		*per_cpu_fw_alloc;
	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */

	/*
	 * Pressure flag: try to collapse.
	 * Technical note: it is used by multiple contexts non-atomically.
	 * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
	 * All the __sk_mem_schedule() is of this nature: accounting
	 * is strict, actions are advisory and have some latency.
	 */
	unsigned long		*memory_pressure;
	long			*sysctl_mem;

	int			*sysctl_wmem;
	int			*sysctl_rmem;
	u32			sysctl_wmem_offset;
	u32			sysctl_rmem_offset;

	int			max_header;
	bool			no_autobind;

	struct kmem_cache	*slab;
	unsigned int		obj_size;
	unsigned int		ipv6_pinfo_offset;
	slab_flags_t		slab_flags;
	unsigned int		useroffset;	/* Usercopy region offset */
	unsigned int		usersize;	/* Usercopy region size */

	unsigned int __percpu	*orphan_count;

	struct request_sock_ops	*rsk_prot;
	struct timewait_sock_ops *twsk_prot;

	union {
		struct inet_hashinfo	*hashinfo;
		struct udp_table	*udp_table;
		struct raw_hashinfo	*raw_hash;
		struct smc_hashinfo	*smc_hash;
	} h;

	struct module		*owner;

	char			name[32];

	struct list_head	node;
	int			(*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

/*
 * Return false once sk_wmem_queued has reached sk_sndbuf. Otherwise ask
 * the protocol's stream_memory_free() hook if it has one; a protocol
 * without the hook is treated as always having memory free.
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
		return false;

	return sk->sk_prot->stream_memory_free ?
		INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
				     tcp_stream_memory_free, sk, wake) : true;
}

/* __sk_stream_memory_free() with @wake == 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
	return __sk_stream_memory_free(sk, 0);
}

/*
 * True when sk_stream_wspace() is at least sk_stream_min_wspace() and
 * __sk_stream_memory_free() also agrees.
 */
static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
	       __sk_stream_memory_free(sk, wake);
}

/* __sk_stream_is_writeable() with @wake == 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
	return __sk_stream_is_writeable(sk, 0);
}

/*
 * Return the result of cgroup_is_descendant() for the socket's cgroup
 * and @ancestor, or -ENOTSUPP when CONFIG_SOCK_CGROUP_DATA is not set.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
					    struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
				    ancestor);
#else
	return -ENOTSUPP;
#endif
}

/* Batch size passed to percpu_counter_add_batch() for sockets_allocated. */
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16

/* Subtract one from the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
	percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
				 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* Add one to the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
	percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
				 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* percpu_counter_read_positive() of the protocol's sockets_allocated. */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
	return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
}

/* percpu_counter_sum_positive() of @prot's sockets_allocated. */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
	return percpu_counter_sum_positive(prot->sockets_allocated);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-cpu in-use counters: a total plus one slot per proto->inuse_idx. */
struct prot_inuse {
	int all;
	int val[PROTO_INUSE_NR];
};

/* Add @val to this CPU's in-use slot for @prot in @net. */
static inline void sock_prot_inuse_add(const struct net *net,
				       const struct proto *prot, int val)
{
	this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}

/* Add @val to this CPU's total in-use counter for @net. */
static inline void sock_inuse_add(const struct net *net, int val)
{
	this_cpu_add(net->core.prot_inuse->all, val);
}

int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
/* Without CONFIG_PROC_FS the in-use accounting helpers are empty stubs. */
static inline void sock_prot_inuse_add(const struct net *net,
				       const struct proto *prot, int val)
{
}

static
inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif


/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 *
 * Calls the protocol's unhash() and then returns the result of hash().
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	return sk->sk_prot->hash(sk);
}

/* About 10 seconds */
#define SOCK_DESTROY_TIME (10*HZ)

/* Sockets 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK	1024

/* RCV_SHUTDOWN | SEND_SHUTDOWN == SHUTDOWN_MASK */
#define SHUTDOWN_MASK	3
#define RCV_SHUTDOWN	1
#define SEND_SHUTDOWN	2

#define SOCK_BINDADDR_LOCK	4
#define SOCK_BINDPORT_LOCK	8

/*
 * A struct socket and a struct inode laid out in one object, so that
 * SOCKET_I() and SOCK_INODE() can convert between the two with
 * container_of().
 */
struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

/* Map an inode embedded in a socket_alloc to its struct socket. */
static inline struct socket *SOCKET_I(struct inode *inode)
{
	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

/* Map a socket embedded in a socket_alloc to its struct inode. */
static inline struct inode *SOCK_INODE(struct socket *socket)
{
	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* Values for the @kind argument of __sk_mem_schedule(). */
#define SK_MEM_SEND	0
#define SK_MEM_RECV	1

/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
	return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}

/* Convert a byte count to pages, rounding up. */
static inline int sk_mem_pages(int amt)
{
	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
{
	/* return true if protocol supports memory accounting */
	return !!sk->sk_prot->memory_allocated;
}

/*
 * Return true when @size bytes may be queued for sending: the protocol
 * does no accounting, sk_forward_alloc already covers @size, or
 * __sk_mem_schedule() accepts the shortfall.
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
	int delta;

	if (!sk_has_account(sk))
		return true;
	delta = size - sk->sk_forward_alloc;
	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}

/*
 * Receive-side counterpart of sk_wmem_schedule(). When @pfmemalloc is
 * true the result is true even if __sk_mem_schedule() refuses; note
 * that __sk_mem_schedule() is still called first in that case.
 */
static inline bool
__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
	int delta;

	if (!sk_has_account(sk))
		return true;
	delta = size - sk->sk_forward_alloc;
	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
	       pfmemalloc;
}

/* __sk_rmem_schedule() with pfmemalloc taken from skb_pfmemalloc(@skb). */
static inline bool
sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
{
	return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}

/*
 * Return sk_reserved_mem minus sk_wmem_queued minus sk_rmem_alloc,
 * clamped to zero from below; 0 when sk_reserved_mem is 0.
 */
static inline int sk_unused_reserved_mem(const struct sock *sk)
{
	int unused_mem;

	if (likely(!sk->sk_reserved_mem))
		return 0;

	unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued -
			atomic_read(&sk->sk_rmem_alloc);

	return unused_mem > 0 ? unused_mem : 0;
}

/*
 * Hand sk_forward_alloc in excess of the unused reservation to
 * __sk_mem_reclaim(), but only when that excess is at least PAGE_SIZE.
 * Does nothing for protocols without memory accounting.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
	int reclaimable;

	if (!sk_has_account(sk))
		return;

	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

	if (reclaimable >= (int)PAGE_SIZE)
		__sk_mem_reclaim(sk, reclaimable);
}

/* Clear sk_reserved_mem so nothing is held back, then sk_mem_reclaim(). */
static inline void sk_mem_reclaim_final(struct sock *sk)
{
	sk->sk_reserved_mem = 0;
	sk_mem_reclaim(sk);
}

/* Subtract @size from the forward allocation of an accounted socket. */
static inline void sk_mem_charge(struct sock *sk, int size)
{
	if (!sk_has_account(sk))
		return;
	sk_forward_alloc_add(sk, -size);
}

/* Add @size back to the forward allocation, then try sk_mem_reclaim(). */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
	if (!sk_has_account(sk))
		return;
	sk_forward_alloc_add(sk, size);
	sk_mem_reclaim(sk);
}

#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
/* Take a reference on @owner with __module_get() and store it in sk_owner. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
	__module_get(owner);
	sk->sk_owner = owner;
}

/* Forget the recorded owner; does not call module_put(). */
static inline void sk_owner_clear(struct sock *sk)
{
	sk->sk_owner = NULL;
}

/* module_put() the module recorded in sk_owner. */
static inline void sk_owner_put(struct sock *sk)
{
	module_put(sk->sk_owner);
}
#else
/* Without PROVE_LOCKING and MODULES both enabled, these are no-ops. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
}

static inline void sk_owner_clear(struct sock *sk)
{
}

static inline void sk_owner_put(struct sock *sk)
{
}
#endif
/*
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
*/
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)	\
do {									\
	sk_owner_set(sk, THIS_MODULE);					\
	sk->sk_lock.owned = 0;						\
	init_waitqueue_head(&sk->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
				   sizeof((sk)->sk_lock));		\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
				   (skey), (sname));			\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)

/* True if lockdep sees either sk_lock or sk_lock.slock as held. */
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
	return lockdep_is_held(&sk->sk_lock) ||
	       lockdep_is_held(&sk->sk_lock.slock);
}

void lock_sock_nested(struct sock *sk, int subclass);

/* lock_sock_nested() with subclass 0. */
static inline void lock_sock(struct sock *sk)
{
	lock_sock_nested(sk, 0);
}

void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/* BH context may only use the following locking interface. */
#define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
				spin_lock_nested(&((__sk)->sk_lock.slock), \
				SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))

bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small section, where process won't block
 * return false if fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * return true if slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
static inline bool lock_sock_fast(struct sock *sk)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);

	return __lock_sock_fast(sk);
}

/* fast socket lock variant for caller already holding a [different] socket lock */
static inline bool lock_sock_fast_nested(struct sock *sk)
{
	mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

	return __lock_sock_fast(sk);
}

/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode
 *
 * fast unlock socket for user context.
 * If slow mode is on, we call regular release_sock()
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
	__releases(&sk->sk_lock.slock)
{
	if (slow) {
		release_sock(sk);
		__release(&sk->sk_lock.slock);
	} else {
		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
		spin_unlock_bh(&sk->sk_lock.slock);
	}
}

void sockopt_lock_sock(struct sock *sk);
void sockopt_release_sock(struct sock *sk);
bool sockopt_ns_capable(struct user_namespace *ns, int cap);
bool sockopt_capable(int cap);

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also exclusive sleep lock serializing
 * accesses from user process context.
*/

/*
 * With CONFIG_LOCKDEP, warn once if the socket lock is not held while
 * debug_locks is set. Compiles to nothing otherwise.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
	WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/* Inverse check: warn once if the socket lock is held. */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
	WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/* Return sk_lock.owned after running the sock_owned_by_me() check. */
static inline bool sock_owned_by_user(const struct sock *sk)
{
	sock_owned_by_me(sk);
	return sk->sk_lock.owned;
}

/* Return sk_lock.owned without the lockdep check. */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
	return sk->sk_lock.owned;
}

/*
 * Clear sk_lock.owned and tell lockdep the lock was released.
 * Warns (debug builds) if the socket was not marked owned.
 */
static inline void sock_release_ownership(struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
	sk->sk_lock.owned = 0;

	/* The sk_lock has mutex_unlock() semantics: */
	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
	/* const is cast away only so the helpers below can be called */
	struct sock *sk = (struct sock *)csk;

	return !sock_owned_by_user_nocheck(sk) &&
		!spin_is_locked(&sk->sk_lock.slock);
}

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);

struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);

/*
 * Orphan @skb, then attach it to @sk with sock_edemux as destructor,
 * but only if a reference on @sk could be taken (sk_refcnt was not
 * already zero). Otherwise @skb is left orphaned.
 */
static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
	}
}
#else
#define sock_edemux sock_efree
#endif

int
sk_setsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
		    sockptr_t optval, unsigned int optlen);
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
		       int optname, sockptr_t optval, int optlen);
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
		       int optname, sockptr_t optval, sockptr_t optlen);

int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order);

/* sock_alloc_send_pskb() with data_len == 0 and max_page_order == 0. */
static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
						  unsigned long size,
						  int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}

void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void *sock_kmemdup(struct sock *sk, const void *src,
		   int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

/*
 * Switch @sk to @proto. If a struct socket is attached, its
 * SOCK_SUPPORT_ZC flag is cleared first; the new pointer is then
 * stored with WRITE_ONCE().
 */
static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
{
	if (sk->sk_socket)
		clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
	WRITE_ONCE(sk->sk_prot, proto);
}

/* Per-send control-message state; see sockcm_init() for the defaults. */
struct sockcm_cookie {
	u64 transmit_time;
	u32 mark;
	u32 tsflags;
	u32 ts_opt_id;
	u32 priority;
	u32 dmabuf_id;
};

/*
 * Initialise @sockc from the socket: mark, tsflags and priority are
 * copied from @sk with READ_ONCE(); every other field is zeroed by the
 * compound-literal assignment.
 */
static inline void sockcm_init(struct sockcm_cookie *sockc,
			       const struct sock *sk)
{
	*sockc = (struct sockcm_cookie) {
		.mark = READ_ONCE(sk->sk_mark),
		.tsflags = READ_ONCE(sk->sk_tsflags),
		.priority = READ_ONCE(sk->sk_priority),
	};
}

int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops
when a protocol
 * does not implement a particular function.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *,
		   struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen);

void sk_common_release(struct sock *sk);

/*
 *	Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
*
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/* Ungrab socket and destroy it, if it was the last reference. */
static inline void sock_put(struct sock *sk)
{
	if (refcount_dec_and_test(&sk->sk_refcnt))
		sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 */
void sock_gen_put(struct sock *sk);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
		     unsigned int trim_cap, bool refcounted);

/* __sk_receive_skb() with trim_cap == 1 and refcounted == true. */
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
				 const int nested)
{
	return __sk_receive_skb(sk, skb, nested, 1, true);
}

/*
 * Record @tx_queue as the socket's TX queue. Values whose low 16 bits
 * equal USHRT_MAX are rejected with a one-time warning, since that
 * value is reserved as NO_QUEUE_MAPPING.
 */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
	/* sk_tx_queue_mapping accepts only up to a 16-bit value */
	if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
		return;
	/* Paired with READ_ONCE() in sk_tx_queue_get() and
	 * other WRITE_ONCE() because socket lock might be not held.
	 */
	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/* Sentinel stored in sk_{tx,rx}_queue_mapping when no queue is recorded. */
#define NO_QUEUE_MAPPING	USHRT_MAX

/* Reset the TX queue mapping to NO_QUEUE_MAPPING. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
	/* Paired with READ_ONCE() in sk_tx_queue_get() and
	 * other WRITE_ONCE() because socket lock might be not held.
	 */
	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

/*
 * Return the recorded TX queue, or -1 when @sk is NULL or no queue is
 * recorded.
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
	if (sk) {
		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
		 * and sk_tx_queue_set().
		 */
		int val = READ_ONCE(sk->sk_tx_queue_mapping);

		if (val != NO_QUEUE_MAPPING)
			return val;
	}
	return -1;
}

/*
 * Copy the RX queue recorded in @skb (if any) into the socket. With
 * @force_set false the store is skipped when the value is unchanged.
 * No-op without CONFIG_SOCK_RX_QUEUE_MAPPING.
 */
static inline void __sk_rx_queue_set(struct sock *sk,
				     const struct sk_buff *skb,
				     bool force_set)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	if (skb_rx_queue_recorded(skb)) {
		u16 rx_queue = skb_get_rx_queue(skb);

		if (force_set ||
		    unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
			WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
	}
#endif
}

/* Unconditionally store the skb's recorded RX queue. */
static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
	__sk_rx_queue_set(sk, skb, true);
}

/* Store the skb's recorded RX queue only if it differs from the current one. */
static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
{
	__sk_rx_queue_set(sk, skb, false);
}

/* Reset the RX queue mapping to NO_QUEUE_MAPPING (when configured). */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}

/*
 * Return the recorded RX queue, or -1 when @sk is NULL, no queue is
 * recorded, or CONFIG_SOCK_RX_QUEUE_MAPPING is not set.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	if (sk) {
		int res = READ_ONCE(sk->sk_rx_queue_mapping);

		if (res != NO_QUEUE_MAPPING)
			return res;
	}
#endif

	return -1;
}

/*
 * Point sk->sk_socket at @sock. When @sock is non-NULL the inode's
 * i_uid and i_ino are cached in sk_uid and sk_ino; when it is NULL only
 * sk_ino is reset to 0.
 */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
	sk->sk_socket = sock;
	if (sock) {
		WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
		WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
	} else {
		/* Note: sk_uid is unchanged. */
		WRITE_ONCE(sk->sk_ino, 0);
	}
}

/*
 * Return the wait queue head of sk->sk_wq. The BUILD_BUG_ON() pins
 * 'wait' as the first member of struct socket_wq.
 */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
	return &rcu_dereference_raw(sk->sk_wq)->wait;
}
/* Detach socket from process context.
* Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 */
static inline void sock_orphan(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_DEAD);
	sk_set_socket(sk, NULL);
	sk->sk_wq  = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

/*
 * Attach @sk to @parent under sk_callback_lock: publish the parent's
 * wait queue in sk->sk_wq, link parent->sk and sk->sk_socket to each
 * other, and call security_sock_graft(). Warns if @parent already has
 * a sock.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
	WARN_ON(parent->sk);
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	parent->sk = sk;
	sk_set_socket(sk, parent);
	security_sock_graft(sk, parent);
	write_unlock_bh(&sk->sk_callback_lock);
}

/* Lockless read of the cached inode number (0 when detached). */
static inline unsigned long sock_i_ino(const struct sock *sk)
{
	/* Paired with WRITE_ONCE() in sock_graft() and sock_orphan(),
	 * both of which store it through sk_set_socket().
	 */
	return READ_ONCE(sk->sk_ino);
}

/* Lockless read of the cached socket owner uid. */
static inline kuid_t sk_uid(const struct sock *sk)
{
	/* Paired with WRITE_ONCE() in sockfs_setattr() */
	return READ_ONCE(sk->sk_uid);
}

/* sk_uid(@sk) when @sk is non-NULL, else uid 0 mapped in @net's user_ns. */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
	return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
}

/* Return a random u32 that is never 0 (0 is replaced by 1). */
static inline u32 net_tx_rndhash(void)
{
	u32 v = get_random_u32();

	return v ?: 1;
}

/* Store a fresh non-zero random value in sk_txhash. */
static inline void sk_set_txhash(struct sock *sk)
{
	/* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
	WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/*
 * Pick a new txhash if one is already set and rehashing is enabled
 * for the socket.
 *
 * Return: true if sk_txhash was replaced.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
	if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
		sk_set_txhash(sk);
		return true;
	}
	return false;
}

/*
 * Return the cached dst without taking a reference. The RCU check
 * accepts the socket lock as protection.
 */
static inline struct dst_entry *
__sk_dst_get(const struct sock *sk)
{
	return rcu_dereference_check(sk->sk_dst_cache,
				     lockdep_sock_is_held(sk));
}

/*
 * Return the cached dst with a reference taken via rcuref_get(), or
 * NULL if there is none or the reference could not be obtained.
 */
static inline struct dst_entry *
sk_dst_get(const struct sock *sk)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(sk->sk_dst_cache);
	if (dst && !rcuref_get(&dst->__rcuref))
		dst = NULL;
	rcu_read_unlock();
	return dst;
}

/* Call the cached dst's negative_advice() op, if both exist. */
static inline void __dst_negative_advice(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->ops->negative_advice)
		dst->ops->negative_advice(sk, dst);
}

/* sk_rethink_txhash() followed by __dst_negative_advice(). */
static inline void dst_negative_advice(struct sock *sk)
{
	sk_rethink_txhash(sk);
	__dst_negative_advice(sk);
}

/*
 * Replace the cached dst with @dst and release the old one. Also
 * clears the TX queue mapping and sk_dst_pending_confirm. The
 * rcu_dereference_protected() check expects the socket lock to be held.
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
	old_dst = rcu_dereference_protected(sk->sk_dst_cache,
					    lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_dst_cache, dst);
	dst_release(old_dst);
}

/*
 * Same as __sk_dst_set() but swaps the pointer with xchg() instead of
 * relying on the socket lock.
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
	old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
	dst_release(old_dst);
}

/* __sk_dst_set() with a NULL dst. */
static inline void
__sk_dst_reset(struct sock *sk)
{
	__sk_dst_set(sk, NULL);
}

/* sk_dst_set() with a NULL dst. */
static inline void
sk_dst_reset(struct sock *sk)
{
	sk_dst_set(sk, NULL);
}

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/* Set sk_dst_pending_confirm to 1 unless it is already non-zero. */
static inline void
sk_dst_confirm(struct sock *sk)
{
	/* read first so the store is skipped when already set */
	if (!READ_ONCE(sk->sk_dst_pending_confirm))
		WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/*
 * If @skb carries a pending dst confirmation, clear the owning
 * socket's sk_dst_pending_confirm (when the skb has a socket and the
 * flag is set) and call neigh_confirm() on @n.
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
	if (skb_get_dst_pending_confirm(skb)) {
		struct sock *sk = skb->sk;

		if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
			WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
		neigh_confirm(n);
	}
}

bool sk_mc_loop(const struct sock *sk);

/* net_gso_ok() for the socket's route capabilities and GSO type. */
static inline bool sk_can_gso(const struct sock *sk)
{
	return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/* Set sk_gso_disabled and strip the GSO bits from sk_route_caps. */
static inline void sk_gso_disable(struct sock *sk)
{
	sk->sk_gso_disabled = 1;
	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}

/*
 * Copy @copy bytes from @from to @to, choosing the copy routine by
 * checksum mode and route capabilities:
 *  - CHECKSUM_NONE: copy and checksum, folding the result into
 *    skb->csum at @offset;
 *  - NETIF_F_NOCACHE_COPY set: copy_from_iter_full_nocache();
 *  - otherwise: copy_from_iter_full().
 *
 * Return: 0 on success, -EFAULT if the copy was not complete.
 */
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
					   struct iov_iter *from, char *to,
					   int copy, int offset)
{
	if (skb->ip_summed == CHECKSUM_NONE) {
		__wsum csum = 0;
		if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, offset);
	} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
		if (!copy_from_iter_full_nocache(to, copy, from))
			return -EFAULT;
	} else if (!copy_from_iter_full(to, copy, from))
		return -EFAULT;

	return 0;
}

/*
 * Append @copy bytes from @from to the skb's linear area. On failure
 * the skb is trimmed back to its previous length.
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
				       struct iov_iter *from, int copy)
{
	int err, offset = skb->len;

	err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy),
				       copy, offset);
	if (err)
		__skb_trim(skb, offset);

	return err;
}

/*
 * Copy @copy bytes from @from into @page at @off. On success the skb
 * length, sk_wmem_queued and the socket's memory charge are each
 * increased by @copy; on failure nothing is accounted.
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
					   struct sk_buff *skb,
					   struct page *page,
					   int off, int copy)
{
	int err;

	err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
				       copy, skb->len);
	if (err)
		return err;

	skb_len_add(skb, copy);
	sk_wmem_queued_add(sk, copy);
	sk_mem_charge(sk, copy);
	return 0;
}

/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
 *
 * Return: sk_wmem_alloc minus initial offset of one
 */
static inline
int sk_wmem_alloc_get(const struct sock *sk)
{
	return refcount_read(&sk->sk_wmem_alloc) - 1;
}

/**
 * sk_rmem_alloc_get - returns read allocations
 * @sk: socket
 *
 * Return: sk_rmem_alloc
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
	return atomic_read(&sk->sk_rmem_alloc);
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
	return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq
 *
 * Return: true if socket_wq has waiting processes
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1
 * could then end up calling schedule and sleep forever if there are no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
	return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - wrapper for the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	/* Provides a barrier we need to be sure we are in sync
	 * with the socket flags modification.
	 *
	 * This memory barrier is paired in the wq_has_sleeper.
	 */
	poll_wait(filp, &sock->wq.wait, p);
}

/*
 * Copy a non-zero sk_txhash into skb->hash and mark it as an L4 hash.
 * The skb is left untouched when sk_txhash is 0.
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
	/* This pairs with WRITE_ONCE() in sk_set_txhash() */
	u32 txhash = READ_ONCE(sk->sk_txhash);

	if (txhash) {
		skb->l4_hash = 1;
		skb->hash = txhash;
	}
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *	Queue a received datagram if it will fit. Stream and sequenced
 *	protocols can't normally use this as they need to fit buffers in
 *	and play with them.
 *
 *	Inlined as it's very short and called for pretty much every
 *	packet ever received.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = sock_rfree;
	/* account the skb's truesize against the receive side */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

/*
 * Attach @skb to @sk with sock_efree as destructor, but only when @sk
 * is non-NULL and a reference on it could be taken.
 *
 * Return: true if the skb now owns a reference on @sk; false otherwise,
 * in which case @skb is left unmodified.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
	if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
		skb_orphan(skb);
		skb->destructor = sock_efree;
		skb->sk = sk;
		return true;
	}
	return false;
}

/*
 * Clone @skb and charge the clone to @sk's receive memory.
 *
 * Return: the clone, owned by @sk, or NULL if cloning failed or
 * sk_rmem_schedule() refused the charge (the clone is freed then).
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
	skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
	if (skb) {
		if (sk_rmem_schedule(sk, skb, skb->truesize)) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
		__kfree_skb(skb);
	}
	return NULL;
}

/*
 * Orphan @skb unless its destructor is sock_wfree; in that case keep
 * the owner and set skb->slow_gro instead.
 */
static inline void skb_prepare_for_gro(struct sk_buff *skb)
{
	if (skb->destructor != sock_wfree) {
		skb_orphan(skb);
		return;
	}
	skb->slow_gro = 1;
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
			struct sk_buff *skb, unsigned int flags,
			void (*destructor)(struct sock *sk,
					   struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
			      enum
skb_drop_reason *reason);

/* sock_queue_rcv_skb_reason() without a drop-reason out parameter. */
static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	return sock_queue_rcv_skb_reason(sk, skb, NULL);
}

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *	Recover an error report and clear atomically
 *
 *	Return: 0 when no error is pending, otherwise the negated sk_err
 *	value that was swapped out.
 */

static inline int sock_error(struct sock *sk)
{
	int err;

	/* Avoid an atomic operation for the common case.
	 * This is racy since another cpu/thread can change sk_err under us.
	 */
	if (likely(data_race(!sk->sk_err)))
		return 0;

	err = xchg(&sk->sk_err, 0);
	return -err;
}

void sk_error_report(struct sock *sk);

/*
 * Return sk_sndbuf minus sk_wmem_alloc, clamped to zero from below;
 * always 0 once SEND_SHUTDOWN is set in sk_shutdown.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
	int amt = 0;

	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
		amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);
		if (amt < 0)
			amt = 0;
	}
	return amt;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 *
 *  The two SOCKWQ_ASYNC_* bits are only touched when SOCK_FASYNC is
 *  set on the socket; any other bit is always updated.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
	if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
	    !sock_flag(sk, SOCK_FASYNC))
		return;

	set_bit(nr, &sk->sk_wq_raw->flags);
}

/* Clearing counterpart of sk_set_bit(), with the same SOCK_FASYNC filter. */
static inline void sk_clear_bit(int nr, struct sock *sk)
{
	if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
	    !sock_flag(sk, SOCK_FASYNC))
		return;

	clear_bit(nr, &sk->sk_wq_raw->flags);
}

/*
 * When SOCK_FASYNC is set, call sock_wake_async() on sk->sk_wq inside
 * an RCU read-side section taken here.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
	if (sock_flag(sk, SOCK_FASYNC)) {
		rcu_read_lock();
		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
		rcu_read_unlock();
	}
}

/*
 * Variant of sk_wake_async() that does not take rcu_read_lock()
 * itself; presumably the caller must already be in an RCU read-side
 * section - confirm against callers.
 */
static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
{
	if (unlikely(sock_flag(sk, SOCK_FASYNC)))
		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum.
*/ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? 
len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ #define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, struct timespec64 *ts); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, 
struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK) | \ (1UL << SOCK_RCVPRIORITY) | \ (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags, 
__u32 *tskey) { __u32 tsflags = sockc->tsflags; if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { if (tsflags & SOCKCM_FLAG_TS_OPT_ID) *tskey = sockc->ts_opt_id; else *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } } static inline void sock_tx_timestamp(struct sock *sk, const struct sockcm_cookie *sockc, __u8 *tx_flags) { _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, const struct sockcm_cookie *sockc) { _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_inet(const struct sock *sk) { int family = READ_ONCE(sk->sk_family); return family == AF_INET || family == AF_INET6; } static inline bool sk_is_tcp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_udp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) { return sk->sk_family == AF_VSOCK; } static inline bool sk_may_scm_recv(const struct sock *sk) { return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || sk->sk_family == AF_NETLINK || (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
*/ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } static inline bool sk_requests_wifi_status(struct sock *sk) { return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) * TCP RST and ACK can be attached to TIME_WAIT. 
*/ static inline bool sk_listener_or_tw(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. 
* They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); #else static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) { } #endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, 
unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot->sock_is_readable) return prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
783 785 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
// SPDX-License-Identifier: GPL-2.0
/*
 * tracing clocks
 *
 *  Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Implements 3 trace clock variants, with differing scalability/precision
 * tradeoffs:
 *
 *  -   local: CPU-local trace clock
 *  -  medium: scalable global clock with some jitter
 *  -  global: globally monotonic, serialized clock
 *
 * Tracer plugins will choose a default from these clocks.
 */
#include <linux/spinlock.h>
#include <linux/irqflags.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>

/*
 * trace_clock_local(): the simplest and least coherent tracing clock.
 *
 * Useful for tracing that does not cross to other CPUs nor
 * does it go through idle events.
 *
 * Returns the sched_clock() value sampled with preemption disabled.
 */
u64 notrace trace_clock_local(void)
{
	u64 clock;

	/*
	 * sched_clock() is an architecture implemented, fast, scalable,
	 * lockless clock. It is not guaranteed to be coherent across
	 * CPUs, nor across CPU idle events.
	 */
	preempt_disable_notrace();
	clock = sched_clock();
	preempt_enable_notrace();

	return clock;
}
EXPORT_SYMBOL_GPL(trace_clock_local);

/*
 * trace_clock(): 'between' trace clock. Not completely serialized,
 * but not completely incorrect when crossing CPUs either.
 *
 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
 * jitter between CPUs. So it's a pretty scalable clock, but there
 * can be offsets in the trace data.
 *
 * Returns the value of local_clock().
 */
u64 notrace trace_clock(void)
{
	return local_clock();
}
EXPORT_SYMBOL_GPL(trace_clock);

/*
 * trace_clock_jiffies(): Simply use jiffies as a clock counter.
 * Note that this use of jiffies_64 is not completely safe on
 * 32-bit systems. But the window is tiny, and the effect if
 * we are affected is that we will have an obviously bogus
 * timestamp on a trace event - i.e. not life threatening.
 *
 * Returns the jiffies elapsed since INITIAL_JIFFIES, converted with
 * jiffies_64_to_clock_t().
 */
u64 notrace trace_clock_jiffies(void)
{
	return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
EXPORT_SYMBOL_GPL(trace_clock_jiffies);

/*
 * trace_clock_global(): special globally coherent trace clock
 *
 * It has higher overhead than the other trace clocks but is still
 * an order of magnitude faster than GTOD derived hardware clocks.
 *
 * Used by plugins that need globally coherent timestamps.
 */

/* keep prev_time and lock in the same cacheline. */
static struct {
	u64 prev_time;
	arch_spinlock_t lock;
} trace_clock_struct ____cacheline_aligned_in_smp =
	{
		.lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
	};

/*
 * Returns sched_clock_cpu() for the current CPU, bumped up to the last
 * published prev_time when it would otherwise be behind it. The published
 * prev_time is only advanced when the trylock succeeds and the caller is
 * not in NMI context. Runs with local interrupts disabled throughout.
 */
u64 notrace trace_clock_global(void)
{
	unsigned long flags;
	int this_cpu;
	u64 now, prev_time;

	raw_local_irq_save(flags);

	this_cpu = raw_smp_processor_id();

	/*
	 * The global clock "guarantees" that the events are ordered
	 * between CPUs. But if two events on two different CPUS call
	 * trace_clock_global at roughly the same time, it really does
	 * not matter which one gets the earlier time. Just make sure
	 * that the same CPU will always show a monotonic clock.
	 *
	 * Use a read memory barrier to get the latest written
	 * time that was recorded.
	 */
	smp_rmb();
	prev_time = READ_ONCE(trace_clock_struct.prev_time);
	now = sched_clock_cpu(this_cpu);

	/* Make sure that now is always greater than or equal to prev_time */
	if ((s64)(now - prev_time) < 0)
		now = prev_time;

	/*
	 * If in an NMI context then don't risk lockups and simply return
	 * the current time.
	 */
	if (unlikely(in_nmi()))
		goto out;

	/* Tracing can cause strange recursion, always use a try lock */
	if (arch_spin_trylock(&trace_clock_struct.lock)) {
		/* Reread prev_time in case it was already updated */
		prev_time = READ_ONCE(trace_clock_struct.prev_time);
		if ((s64)(now - prev_time) < 0)
			now = prev_time;

		trace_clock_struct.prev_time = now;

		/* The unlock acts as the wmb for the above rmb */
		arch_spin_unlock(&trace_clock_struct.lock);
	}
 out:
	raw_local_irq_restore(flags);

	return now;
}
EXPORT_SYMBOL_GPL(trace_clock_global);

/* Backing counter for trace_clock_counter(). */
static atomic64_t trace_counter;

/*
 * trace_clock_counter(): simply an atomic counter.
 * Use the trace_counter "counter" for cases where you do not care
 * about timings, but are interested in strict ordering.
 *
 * Returns the counter value after the increment.
 */
u64 notrace trace_clock_counter(void)
{
	return atomic64_inc_return(&trace_counter);
}
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 
#define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
2 1 2 2 2 1 1 1 1 2 1 2 2 2 1 1 1 1 1 1 2 3 2 1 2 1 1 1 2 1 1 1 2 18 18 16 16 2 1 3 3 1 2 2 1 1 10 9 20 6 18 1 19 20 1 153 150 150 148 3 1 1 1 1 1 14 6 1 5 1 3 3 1 2 2 1 1 1 1 2 1 1 1 17 16 3 14 2 3 2 1 55 55 1 38 21 21 3 5 1 2 3 10 5 8 6 6 2 1 3 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 
960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 
1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 
2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 
2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 // SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * * derived from arch/x86/kvm/x86.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. 
* Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> #include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> #include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) /* * As per Hyper-V TLFS, extended hypercalls start from 0x8001 * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value * where each bit tells which extended hypercall is available besides * HvExtCallQueryCapabilities. * * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit * assigned. * * 0x8002 - Bit 0 * 0x8003 - Bit 1 * .. 
* 0x8041 - Bit 63 * * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); } static inline int synic_get_sint_vector(u64 sint_value) { if (sint_value & HV_SYNIC_SINT_MASKED) return -1; return sint_value & HV_SYNIC_SINT_VECTOR_MASK; } static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { int i; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; } return false; } static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, int vector) { int i; u64 sint_value; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { sint_value = synic_read_sint(synic, i); if (synic_get_sint_vector(sint_value) == vector && sint_value & HV_SYNIC_SINT_AUTO_EOI) return true; } return false; } static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); bool auto_eoi_old, auto_eoi_new; if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (auto_eoi_old == auto_eoi_new) return; if (!enable_apicv) return; down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; else hv->synic_auto_eoi_used--; /* * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on * the hypervisor to manually inject IRQs. 
*/ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_HYPERV, !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, u64 data, bool host) { int vector, old_vector; bool masked; vector = data & HV_SYNIC_SINT_VECTOR_MASK; masked = data & HV_SYNIC_SINT_MASKED; /* * Valid vectors are 16-255, however, nested Hyper-V attempts to write * default '0x10000' value on boot and this should not #GP. We need to * allow zero-initing the register from host as well. */ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked) return 1; /* * Guest may configure multiple SINTs to use the same vector, so * we maintain a bitmap of vectors handled by synic, and a * bitmap of vectors with auto-eoi behavior. The bitmaps are * updated here, and atomically queried on fast paths. */ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK; atomic64_set(&synic->sint[sint], data); synic_update_vector(synic, old_vector); synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? 
synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); /* Try to deliver pending Hyper-V SynIC timers messages */ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && !stimer->config.direct_mode && stimer->config.sintx == sint) stimer_mark_pending(stimer, false); } idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); if (gsi != -1) kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; hv_vcpu->exit.u.synic.control = synic->control; hv_vcpu->exit.u.synic.evt_page = synic->evt_page; hv_vcpu->exit.u.synic.msg_page = synic->msg_page; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: synic->control = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SVERSION: if (!host) { ret = 1; break; } synic->version = data; break; case HV_X64_MSR_SIEFP: if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->evt_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: if ((data & HV_SYNIC_SIMP_ENABLE) && !host && 
!synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->msg_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_EOM: { int i; if (!synic->active) break; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; } case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); break; default: ret = 1; break; } return ret; } static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); return hv_vcpu->cpuid_cache.syndbg_cap_eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = vcpu->run->hyperv.u.syndbg.status; return 1; } static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; hv_vcpu->exit.u.syndbg.control = syndbg->control.control; hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; vcpu->arch.complete_userspace_io = kvm_hv_syndbg_complete_userspace; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_STATUS: syndbg->control.status = data; break; case 
HV_X64_MSR_SYNDBG_SEND_BUFFER: syndbg->control.send_page = data; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: syndbg->control.recv_page = data; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: syndbg->control.pending_page = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_OPTIONS: syndbg->options = data; break; default: break; } return 0; } static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: *pdata = syndbg->control.control; break; case HV_X64_MSR_SYNDBG_STATUS: *pdata = syndbg->control.status; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: *pdata = syndbg->control.send_page; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: *pdata = syndbg->control.recv_page; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: *pdata = syndbg->control.pending_page; break; case HV_X64_MSR_SYNDBG_OPTIONS: *pdata = syndbg->options; break; default: break; } trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { int ret; if (!synic->active && !host) return 1; ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: *pdata = synic->control; break; case HV_X64_MSR_SVERSION: *pdata = synic->version; break; case HV_X64_MSR_SIEFP: *pdata = synic->evt_page; break; case HV_X64_MSR_SIMP: *pdata = synic->msg_page; break; case HV_X64_MSR_EOM: *pdata = 0; break; case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); break; default: ret = 1; break; } return ret; } static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; vector = synic_get_sint_vector(synic_read_sint(synic, sint)); if (vector < 0) return -ENOENT; memset(&irq, 0, sizeof(irq)); irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; if (!level) return -1; synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) return -EINVAL; atomic_set(&synic->sint_to_gsi[sint], gsi); return 0; } void kvm_hv_irq_routing_update(struct kvm *kvm) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; u32 gsi; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); for (gsi = 0; gsi < 
irq_rt->nr_rt_entries; gsi++) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { if (e->type == KVM_IRQ_ROUTING_HV_SINT) kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, e->hv_sint.sint, gsi); } } } static void synic_init(struct kvm_vcpu_hv_synic *synic) { int i; memset(synic, 0, sizeof(*synic)); synic->version = HV_SYNIC_VERSION_1; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); atomic_set(&synic->sint_to_gsi[i], -1); } } static u64 get_time_ref_counter(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, * is broken, disabled or being updated. */ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); tsc = kvm_read_l1_tsc(vcpu, rdtsc()); return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); } static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) { struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); return HRTIMER_NORESTART; } /* * stimer_start() assumptions: * a) stimer->count is not equal to 0 * b) stimer->config has 
HV_STIMER_ENABLE flag */ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) { u64 time_now; ktime_t ktime_now; time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; div64_u64_rem(time_now - stimer->exp_time, stimer->count, &remainder); stimer->exp_time = time_now + (stimer->count - remainder); } } else stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->exp_time - time_now)), HRTIMER_MODE_ABS); return 0; } stimer->exp_time = stimer->count; if (time_now >= stimer->count) { /* * Expire timer according to Hypervisor Top-Level Functional * specification v4(15.3.1): * "If a one shot is enabled and the specified count is in * the past, it will expire immediately." */ stimer_mark_pending(stimer, false); return 0; } trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), HRTIMER_MODE_ABS); return 0; } static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && !(hv_vcpu->cpuid_cache.features_edx & HV_STIMER_DIRECT_MODE_AVAILABLE))) return 1; trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); if (old_config.enable && 
!new_config.direct_mode && new_config.sintx == 0) new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); stimer->count = count; if (!host) { if (stimer->count == 0) stimer->config.enable = 0; else if (stimer->config.auto_enable) stimer->config.enable = 1; } if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { *pconfig = stimer->config.as_uint64; return 0; } static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) { *pcount = stimer->count; return 0; } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; int r; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; msg_page_gfn = synic->msg_page >> PAGE_SHIFT; /* * Strictly following the spec-mandated ordering would assume setting * .msg_pending before checking .message_type. However, this function * is only called in vcpu context so the entire update is atomic from * guest POV and thus the exact order here doesn't matter. 
*/ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, msg_off + offsetof(struct hv_message, header.message_type), sizeof(hv_hdr.message_type)); if (r < 0) return r; if (hv_hdr.message_type != HVMSG_NONE) { if (no_retry) return 0; hv_hdr.message_flags.msg_pending = 1; r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_flags, msg_off + offsetof(struct hv_message, header.message_flags), sizeof(hv_hdr.message_flags)); if (r < 0) return r; return -EAGAIN; } r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, sizeof(src_msg->header) + src_msg->header.payload_size); if (r < 0) return r; r = synic_set_irq(synic, sint); if (r < 0) return r; if (r == 0) return -EFAULT; return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; /* * To avoid piling up periodic ticks, don't retry message * delivery for them (within "lazy" lost ticks policy). 
*/ bool no_retry = stimer->config.periodic; payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector }; if (lapic_in_kernel(vcpu)) return !kvm_apic_set_irq(vcpu, &irq, NULL); return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; if (!direct) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; if (!(stimer->config.periodic)) stimer->config.enable = 0; } } void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { time_now = get_time_ref_counter(vcpu->kvm); if (time_now >= exp_time) stimer_expiration(stimer); } if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); } else stimer_cleanup(stimer); } } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); kfree(hv_vcpu); vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu) return false; if 
(!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) return -EFAULT; return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; memset(&msg->header, 0, sizeof(msg->header)); msg->header.message_type = HVMSG_TIMER_EXPIRED; msg->header.payload_size = sizeof(*payload); payload->timer_index = stimer->index; payload->expiration_time = 0; payload->delivery_time = 0; } static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (hv_vcpu) return 0; hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); if (!hv_vcpu) return -ENOMEM; vcpu->arch.hyperv = hv_vcpu; hv_vcpu->vcpu = vcpu; synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); hv_vcpu->vp_index = vcpu->vcpu_idx; for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); } return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic; int r; r = 
kvm_hv_vcpu_init(vcpu); if (r) return r; synic = to_hv_synic(vcpu); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: case HV_X64_MSR_REFERENCE_TSC: case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_RESET: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } return r; } static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } /* * The kvmclock and Hyper-V TSC page use similar formulas, and converting * between them is possible: * * kvmclock formula: * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * * Hyper-V formula: * nsec/100 = ticks * scale / 2^64 + offset * * When tsc_timestamp = system_time = 0, offset is zero in the 
Hyper-V formula. * By dividing the kvmclock formula by 100 and equating what's left we get: * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 * * Now expand the kvmclock formula and divide by 100: * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * + system_time / 100 * * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: * nsec/100 = ticks * scale / 2^64 * - tsc_timestamp * scale / 2^64 * + system_time / 100 * * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: * offset = system_time / 100 - tsc_timestamp * scale / 2^64 * * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) return false; /* * check if scale would overflow, if so we use the time ref counter * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) */ max_mul = 100ull << (32 - hv_clock->tsc_shift); if (hv_clock->tsc_to_system_mul >= max_mul) return false; /* * Otherwise compute the scale and offset according to the formulas * derived above. */ tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), hv_clock->tsc_to_system_mul, 100); tsc_ref->tsc_offset = hv_clock->system_time; do_div(tsc_ref->tsc_offset, 100); tsc_ref->tsc_offset -= mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); return true; } /* * Don't touch TSC page values if the guest has opted for TSC emulation after * migration. 
KVM doesn't fully support reenlightenment notifications and TSC * access emulation and Hyper-V is known to expect the values in TSC page to * stay constant before TSC access emulation is disabled from guest side * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC * frequency and guest visible TSC value across migration (and prevent it when * TSC scaling is unsupported). */ static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) { return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && hv->hv_tsc_emulation_control; } void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) goto out_unlock; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) goto out_unlock; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a * change in the master clock, do not bother with caching. */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) goto out_err; if (tsc_seq && tsc_page_update_unsafe(hv)) { if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; } /* * While we're computing and writing the parameters, force the * guest to use the time reference count MSR. */ hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) goto out_err; /* Ensure sequence is zero before writing the rest of the struct. 
*/ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. */ tsc_seq++; if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) tsc_seq = 1; /* Write the struct entirely before the non-zero sequence. */ smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; out_unlock: mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && !tsc_page_update_unsafe(hv)) hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; mutex_unlock(&hv->hv_lock); } static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { if (!hv_vcpu->enforce_cpuid) return true; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_HYPERCALL_AVAILABLE; case HV_X64_MSR_VP_RUNTIME: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_RUNTIME_AVAILABLE; case HV_X64_MSR_TIME_REF_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_TIME_REF_COUNT_AVAILABLE; case HV_X64_MSR_VP_INDEX: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_INDEX_AVAILABLE; case HV_X64_MSR_RESET: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_RESET_AVAILABLE; case HV_X64_MSR_REFERENCE_TSC: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_REFERENCE_TSC_AVAILABLE; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_MSR_SYNIC_AVAILABLE;
	case HV_X64_MSR_STIMER0_CONFIG:
	case HV_X64_MSR_STIMER1_CONFIG:
	case HV_X64_MSR_STIMER2_CONFIG:
	case HV_X64_MSR_STIMER3_CONFIG:
	case HV_X64_MSR_STIMER0_COUNT:
	case HV_X64_MSR_STIMER1_COUNT:
	case HV_X64_MSR_STIMER2_COUNT:
	case HV_X64_MSR_STIMER3_COUNT:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_MSR_SYNTIMER_AVAILABLE;
	case HV_X64_MSR_EOI:
	case HV_X64_MSR_ICR:
	case HV_X64_MSR_TPR:
	case HV_X64_MSR_VP_ASSIST_PAGE:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_MSR_APIC_ACCESS_AVAILABLE;
	case HV_X64_MSR_TSC_FREQUENCY:
	case HV_X64_MSR_APIC_FREQUENCY:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_ACCESS_FREQUENCY_MSRS;
	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_STATUS:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_ACCESS_REENLIGHTENMENT;
	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
		return hv_vcpu->cpuid_cache.features_eax &
			HV_ACCESS_TSC_INVARIANT;
	/* The crash and debug MSRs are gated by EDX bits, not EAX. */
	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
	case HV_X64_MSR_CRASH_CTL:
		return hv_vcpu->cpuid_cache.features_edx &
			HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
	case HV_X64_MSR_SYNDBG_OPTIONS:
	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
		return hv_vcpu->cpuid_cache.features_edx &
			HV_FEATURE_DEBUG_MSRS_AVAILABLE;
	default:
		break;
	}

	/* MSRs without an entry above are refused when CPUID is enforced. */
	return false;
}

/* Guest OS ID value matched (after masking) to detect the affected guest. */
#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839
#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */

/*
 * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC
 * configuration.
 * Such configuration can result from, for example, AMD Erratum 1386 workaround.
 *
 * Print a notice so users aren't left wondering what's suddenly gone wrong.
 *
 * The check is latched through hv->xsaves_xsavec_checked once the guest OS ID
 * matches, so the notice is evaluated at most once per VM.
 */
static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_hv *hv = to_kvm_hv(kvm);

	/* Check again under the hv_lock.  */
	if (hv->xsaves_xsavec_checked)
		return;

	/* Not the targeted guest (yet): leave the flag clear so we look again. */
	if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) !=
	    KVM_HV_WIN2016_GUEST_ID)
		return;

	hv->xsaves_xsavec_checked = true;

	/* UP configurations aren't affected */
	if (atomic_read(&kvm->online_vcpus) < 2)
		return;

	/* Only the "no XSAVES but XSAVEC" combination is a problem. */
	if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
	    !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC))
		return;

	pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
			      "If it fails to boot try disabling XSAVEC in the VM config.\n");
}

/*
 * Locked entry point for the XSAVES/XSAVEC notice.  Returns early, without
 * taking hv_lock, when Hyper-V is not enabled for the vCPU or the check has
 * already been performed.
 */
void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
{
	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);

	if (!vcpu->arch.hyperv_enabled ||
	    hv->xsaves_xsavec_checked)
		return;

	mutex_lock(&hv->hv_lock);
	__kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
	mutex_unlock(&hv->hv_lock);
}

/*
 * Handle a write of @data to a partition-wide Hyper-V MSR.
 *
 * @host is true for host-initiated writes, which bypass the CPUID based
 * access check.  Returns 0 on success and 1 when the write is rejected.
 * kvm_hv_set_msr_common() calls this with hv->hv_lock held.
 */
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
			     bool host)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_hv *hv = to_kvm_hv(kvm);

	if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
		return 1;

	switch (msr) {
	case HV_X64_MSR_GUEST_OS_ID:
		hv->hv_guest_os_id = data;
		/* setting guest os id to zero disables hypercall page */
		if (!hv->hv_guest_os_id)
			hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
		break;
	case HV_X64_MSR_HYPERCALL: {
		/* Scratch buffer for the instructions placed in the hypercall page. */
		u8 instructions[9];
		int i = 0;
		u64 addr;

		/* if guest os id is not set hypercall should remain disabled */
		if (!hv->hv_guest_os_id)
			break;
		/* Disabling only records the value; nothing is written to the guest. */
		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
			hv->hv_hypercall = data;
			break;
		}

		/*
		 * If Xen and Hyper-V hypercalls are both enabled, disambiguate
		 * the same way Xen itself does, by setting the bit 31 of EAX
		 * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
		 * going to be clobbered on 64-bit.
*/
		if (kvm_xen_hypercall_enabled(kvm)) {
			/* orl $0x80000000, %eax */
			instructions[i++] = 0x0d;
			instructions[i++] = 0x00;
			instructions[i++] = 0x00;
			instructions[i++] = 0x00;
			instructions[i++] = 0x80;
		}

		/* vmcall/vmmcall */
		kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
		/* Account for the 3 bytes reserved for the patched instruction. */
		i += 3;

		/* ret */
		((unsigned char *)instructions)[i++] = 0xc3;

		addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
		if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
			return 1;
		/* Commit the MSR value only after the page was written. */
		hv->hv_hypercall = data;
		break;
	}
	case HV_X64_MSR_REFERENCE_TSC:
		hv->hv_tsc_page = data;
		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
			/* Remember who changed the page, guest or host. */
			if (!host)
				hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
			else
				hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
		} else {
			hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
		}
		break;
	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
		return kvm_hv_msr_set_crash_data(kvm,
						 msr - HV_X64_MSR_CRASH_P0,
						 data);
	case HV_X64_MSR_CRASH_CTL:
		/* Host writes go to the setter; guest writes only raise a notification. */
		if (host)
			return kvm_hv_msr_set_crash_ctl(kvm, data);

		if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
			vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
				   hv->hv_crash_param[0],
				   hv->hv_crash_param[1],
				   hv->hv_crash_param[2],
				   hv->hv_crash_param[3],
				   hv->hv_crash_param[4]);

			/* Send notification about crash to user space */
			kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
		}
		break;
	case HV_X64_MSR_RESET:
		/* Only the value 1 requests a reset; other values are accepted and ignored. */
		if (data == 1) {
			vcpu_debug(vcpu, "hyper-v reset requested\n");
			kvm_make_request(KVM_REQ_HV_RESET, vcpu);
		}
		break;
	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
		hv->hv_reenlightenment_control = data;
		break;
	case HV_X64_MSR_TSC_EMULATION_CONTROL:
		hv->hv_tsc_emulation_control = data;
		break;
	case HV_X64_MSR_TSC_EMULATION_STATUS:
		/* The guest may only write zero; the host may write anything. */
		if (data && !host)
			return 1;

		hv->hv_tsc_emulation_status = data;
		break;
	case HV_X64_MSR_TIME_REF_COUNT:
		/* read-only, but still ignore it if host-initiated */
		if (!host)
			return 1;
		break;
	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
		/* Only bit 0 is supported */
		if (data & ~HV_EXPOSE_INVARIANT_TSC)
			return 1;

		/* The feature can't be disabled from the guest */
		if (!host && hv->hv_invtsc_control && !data)
			return 1;

		hv->hv_invtsc_control = data;
		break;
	case HV_X64_MSR_SYNDBG_OPTIONS:
	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
		return syndbg_set_msr(vcpu, msr, data, host);
	default:
		kvm_pr_unimpl_wrmsr(vcpu, msr, data);
		return 1;
	}
	return 0;
}

/*
 * Calculate cpu time spent by current task in 100ns units
 *
 * Sums the adjusted user and system time of 'current' and divides by 100
 * (presumably task_cputime_adjusted() reports nanoseconds - confirm).
 */
static u64 current_task_runtime_100ns(void)
{
	u64 utime, stime;

	task_cputime_adjusted(current, &utime, &stime);

	return div_u64(utime + stime, 100);
}

/*
 * Handle a write of @data to a per-vCPU Hyper-V MSR.
 *
 * @host is true for host-initiated writes, which bypass the CPUID based
 * access check.  Returns 0 on success and 1 when the write is rejected;
 * several MSRs forward the return value of their dedicated handler.
 */
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);

	if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
		return 1;

	switch (msr) {
	case HV_X64_MSR_VP_INDEX: {
		struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
		u32 new_vp_index = (u32)data;

		/* Only the host may change the VP index, and only to a valid one. */
		if (!host || new_vp_index >= KVM_MAX_VCPUS)
			return 1;

		if (new_vp_index == hv_vcpu->vp_index)
			return 0;

		/*
		 * The VP index is initialized to vcpu_index by
		 * kvm_hv_vcpu_postcreate so they initially match.  Now the
		 * VP index is changing, adjust num_mismatched_vp_indexes if
		 * it now matches or no longer matches vcpu_idx.
		 */
		if (hv_vcpu->vp_index == vcpu->vcpu_idx)
			atomic_inc(&hv->num_mismatched_vp_indexes);
		else if (new_vp_index == vcpu->vcpu_idx)
			atomic_dec(&hv->num_mismatched_vp_indexes);

		hv_vcpu->vp_index = new_vp_index;
		break;
	}
	case HV_X64_MSR_VP_ASSIST_PAGE: {
		u64 gfn;
		unsigned long addr;

		/* Disabling: record the value and turn PV EOI off. */
		if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
			hv_vcpu->hv_vapic = data;
			if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
				return 1;
			break;
		}
		gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
		addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
		if (kvm_is_error_hva(addr))
			return 1;

		/*
		 * Clear apic_assist portion of struct hv_vp_assist_page
		 * only, there can be valuable data in the rest which needs
		 * to be preserved e.g. on migration.
*/
		if (__put_user(0, (u32 __user *)addr))
			return 1;
		hv_vcpu->hv_vapic = data;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
		if (kvm_lapic_set_pv_eoi(vcpu,
					    gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
					    sizeof(struct hv_vp_assist_page)))
			return 1;
		break;
	}
	/* EOI/ICR/TPR writes are forwarded to the corresponding APIC register. */
	case HV_X64_MSR_EOI:
		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
	case HV_X64_MSR_ICR:
		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
	case HV_X64_MSR_TPR:
		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
	case HV_X64_MSR_VP_RUNTIME:
		if (!host)
			return 1;
		/* Store an offset so that reads return @data plus time run since now. */
		hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
		break;
	case HV_X64_MSR_SCONTROL:
	case HV_X64_MSR_SVERSION:
	case HV_X64_MSR_SIEFP:
	case HV_X64_MSR_SIMP:
	case HV_X64_MSR_EOM:
	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
		return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
	case HV_X64_MSR_STIMER0_CONFIG:
	case HV_X64_MSR_STIMER1_CONFIG:
	case HV_X64_MSR_STIMER2_CONFIG:
	case HV_X64_MSR_STIMER3_CONFIG: {
		/*
		 * MSR offset from timer 0 halved gives the timer index
		 * (presumably CONFIG and COUNT MSRs alternate - confirm
		 * against the MSR definitions).
		 */
		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;

		return stimer_set_config(to_hv_stimer(vcpu, timer_index),
					 data, host);
	}
	case HV_X64_MSR_STIMER0_COUNT:
	case HV_X64_MSR_STIMER1_COUNT:
	case HV_X64_MSR_STIMER2_COUNT:
	case HV_X64_MSR_STIMER3_COUNT: {
		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;

		return stimer_set_count(to_hv_stimer(vcpu, timer_index),
					data, host);
	}
	case HV_X64_MSR_TSC_FREQUENCY:
	case HV_X64_MSR_APIC_FREQUENCY:
		/* read-only, but still ignore it if host-initiated */
		if (!host)
			return 1;
		break;
	default:
		kvm_pr_unimpl_wrmsr(vcpu, msr, data);
		return 1;
	}

	return 0;
}

/*
 * Read a partition-wide Hyper-V MSR into *@pdata.
 *
 * @host is true for host-initiated reads, which bypass the CPUID based
 * access check.  Returns 0 on success and 1 for a refused or unimplemented
 * MSR; crash and SynDBG MSRs return whatever their handler returns.
 * kvm_hv_get_msr_common() calls this with hv->hv_lock held.
 */
static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
			     bool host)
{
	u64 data = 0;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_hv *hv = to_kvm_hv(kvm);

	if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
		return 1;

	switch (msr) {
	case HV_X64_MSR_GUEST_OS_ID:
		data = hv->hv_guest_os_id;
		break;
	case HV_X64_MSR_HYPERCALL:
		data = hv->hv_hypercall;
		break;
	case HV_X64_MSR_TIME_REF_COUNT:
		data = get_time_ref_counter(kvm);
		break;
	case HV_X64_MSR_REFERENCE_TSC:
		data = hv->hv_tsc_page;
		break;
	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
		return kvm_hv_msr_get_crash_data(kvm,
						 msr - HV_X64_MSR_CRASH_P0,
						 pdata);
	case HV_X64_MSR_CRASH_CTL:
		return kvm_hv_msr_get_crash_ctl(kvm, pdata);
	case HV_X64_MSR_RESET:
		/* The reset MSR always reads as zero. */
		data = 0;
		break;
	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
		data = hv->hv_reenlightenment_control;
		break;
	case HV_X64_MSR_TSC_EMULATION_CONTROL:
		data = hv->hv_tsc_emulation_control;
		break;
	case HV_X64_MSR_TSC_EMULATION_STATUS:
		data = hv->hv_tsc_emulation_status;
		break;
	case HV_X64_MSR_TSC_INVARIANT_CONTROL:
		data = hv->hv_invtsc_control;
		break;
	case HV_X64_MSR_SYNDBG_OPTIONS:
	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
		return syndbg_get_msr(vcpu, msr, pdata, host);
	default:
		kvm_pr_unimpl_rdmsr(vcpu, msr);
		return 1;
	}

	*pdata = data;
	return 0;
}

/*
 * Read a per-vCPU Hyper-V MSR into *@pdata.
 *
 * @host is true for host-initiated reads, which bypass the CPUID based
 * access check.  Returns 0 on success and 1 for a refused or unimplemented
 * MSR; APIC, SynIC and timer MSRs return whatever their handler returns.
 */
static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
			  bool host)
{
	u64 data = 0;
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);

	if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
		return 1;

	switch (msr) {
	case HV_X64_MSR_VP_INDEX:
		data = hv_vcpu->vp_index;
		break;
	case HV_X64_MSR_EOI:
		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
	case HV_X64_MSR_ICR:
		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
	case HV_X64_MSR_TPR:
		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
	case HV_X64_MSR_VP_ASSIST_PAGE:
		data = hv_vcpu->hv_vapic;
		break;
	case HV_X64_MSR_VP_RUNTIME:
		/* Current task runtime plus the offset stored on the last host write. */
		data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
		break;
	case HV_X64_MSR_SCONTROL:
	case HV_X64_MSR_SVERSION:
	case HV_X64_MSR_SIEFP:
	case HV_X64_MSR_SIMP:
	case HV_X64_MSR_EOM:
	case HV_X64_MSR_SINT0 ...
HV_X64_MSR_SINT15:
		return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
	case HV_X64_MSR_STIMER0_CONFIG:
	case HV_X64_MSR_STIMER1_CONFIG:
	case HV_X64_MSR_STIMER2_CONFIG:
	case HV_X64_MSR_STIMER3_CONFIG: {
		/*
		 * MSR offset from timer 0 halved gives the timer index
		 * (presumably CONFIG and COUNT MSRs alternate - confirm
		 * against the MSR definitions).
		 */
		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;

		return stimer_get_config(to_hv_stimer(vcpu, timer_index),
					 pdata);
	}
	case HV_X64_MSR_STIMER0_COUNT:
	case HV_X64_MSR_STIMER1_COUNT:
	case HV_X64_MSR_STIMER2_COUNT:
	case HV_X64_MSR_STIMER3_COUNT: {
		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;

		return stimer_get_count(to_hv_stimer(vcpu, timer_index),
					pdata);
	}
	case HV_X64_MSR_TSC_FREQUENCY:
		/* virtual_tsc_khz scaled by 1000, i.e. kHz converted to Hz. */
		data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
		break;
	case HV_X64_MSR_APIC_FREQUENCY:
		/* 10^9 divided by the APIC bus cycle length in nanoseconds. */
		data = div64_u64(1000000000ULL,
				 vcpu->kvm->arch.apic_bus_cycle_ns);
		break;
	default:
		kvm_pr_unimpl_rdmsr(vcpu, msr);
		return 1;
	}
	*pdata = data;
	return 0;
}

/*
 * Entry point for writes to Hyper-V MSRs.
 *
 * Guest writes are refused (return 1) when Hyper-V is not enabled for the
 * vCPU.  kvm_hv_vcpu_init() is called first and its failure also yields 1.
 * Partition-wide MSRs are handled under hv->hv_lock, per-vCPU MSRs without
 * it.  Returns the handler's result: 0 on success, non-zero on failure.
 */
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);

	if (!host && !vcpu->arch.hyperv_enabled)
		return 1;

	if (kvm_hv_vcpu_init(vcpu))
		return 1;

	if (kvm_hv_msr_partition_wide(msr)) {
		int r;

		mutex_lock(&hv->hv_lock);
		r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
		mutex_unlock(&hv->hv_lock);
		return r;
	} else
		return kvm_hv_set_msr(vcpu, msr, data, host);
}

/*
 * Entry point for reads of Hyper-V MSRs; the read-side counterpart of
 * kvm_hv_set_msr_common() with the same checks and locking.  On success the
 * value is stored in *@pdata and 0 is returned.
 */
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);

	if (!host && !vcpu->arch.hyperv_enabled)
		return 1;

	if (kvm_hv_vcpu_init(vcpu))
		return 1;

	if (kvm_hv_msr_partition_wide(msr)) {
		int r;

		mutex_lock(&hv->hv_lock);
		r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
		mutex_unlock(&hv->hv_lock);
		return r;
	} else
		return kvm_hv_get_msr(vcpu, msr, pdata, host);
}

/*
 * Expand a sparse VP set (@valid_bank_mask plus the packed @sparse_banks)
 * into @vcpu_mask, a bitmap indexed by KVM vCPU index.
 */
static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
				    u64 valid_bank_mask, unsigned long *vcpu_mask)
{
	struct kvm_hv *hv = to_kvm_hv(kvm);
	/* Non-zero when at least one vCPU has a VP index different from its vcpu_idx. */
	bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
	u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
	struct kvm_vcpu *vcpu;
	int
bank, sbank = 0;
	unsigned long i;
	u64 *bitmap;

	/* vcpu_mask must be large enough to be filled like vp_bitmap. */
	BUILD_BUG_ON(sizeof(vp_bitmap) >
		     sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));

	/*
	 * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
	 * fill a temporary buffer and manually test each vCPU's VP index.
	 */
	if (likely(!has_mismatch))
		bitmap = (u64 *)vcpu_mask;
	else
		bitmap = vp_bitmap;

	/*
	 * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
	 * having a '1' for each bank that exists in sparse_banks.  Sets must
	 * be in ascending order, i.e. bank0..bankN.
	 */
	memset(bitmap, 0, sizeof(vp_bitmap));
	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
			 KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
		bitmap[bank] = sparse_banks[sbank++];

	if (likely(!has_mismatch))
		return;

	/* Slow path: translate VP indexes to vCPU indexes one vCPU at a time. */
	bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
			__set_bit(i, vcpu_mask);
	}
}

/*
 * Test whether VP @vp_id is a member of the sparse set described by
 * @valid_bank_mask and the packed @sparse_banks array.
 */
static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
{
	int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
	unsigned long sbank;

	/* The bank that would hold this VP is not present at all. */
	if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
		return false;

	/*
	 * The index into the sparse bank is the number of preceding bits in
	 * the valid mask.  Optimize for VMs with <64 vCPUs by skipping the
	 * fancy math if there can't possibly be preceding bits.
	 */
	if (valid_bit_nr)
		sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
	else
		sbank = 0;

	return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
			(unsigned long *)&sparse_banks[sbank]);
}

/* Decoded state of one Hyper-V hypercall while it is being handled. */
struct kvm_hv_hcall {
	/* Hypercall input data */
	u64 param;
	u64 ingpa;
	u64 outgpa;
	u16 code;
	u16 var_cnt;
	u16 rep_cnt;
	u16 rep_idx;
	bool fast;
	bool rep;
	sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];

	/*
	 * Current read offset when KVM reads hypercall input data gradually,
	 * either offset in bytes from 'ingpa' for regular hypercalls or the
	 * number of already consumed 'XMM halves' for 'fast' hypercalls.
	 */
	union {
		gpa_t data_offset;
		int consumed_xmm_halves;
	};
};


/*
 * Fetch up to @cnt_cap of the @orig_cnt 64-bit input entries that follow the
 * already consumed part of the hypercall input into @data.
 *
 * 'Fast' hypercalls take the entries from the not yet consumed XMM halves and
 * fail with HV_STATUS_INVALID_HYPERCALL_INPUT when @orig_cnt exceeds what is
 * left there.  Other hypercalls read guest memory at ingpa + data_offset and
 * return the kvm_read_guest() result.  Returns 0 on success.
 */
static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
			      u16 orig_cnt, u16 cnt_cap, u64 *data)
{
	/*
	 * Preserve the original count when ignoring entries via a "cap", KVM
	 * still needs to validate the guest input (though the non-XMM path
	 * punts on the checks).
	 */
	u16 cnt = min(orig_cnt, cnt_cap);
	int i, j;

	if (hc->fast) {
		/*
		 * Each XMM holds two sparse banks, but do not count halves that
		 * have already been consumed for hypercall parameters.
		 */
		if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
			return HV_STATUS_INVALID_HYPERCALL_INPUT;

		for (i = 0; i < cnt; i++) {
			/* j counts XMM halves: odd is the high half, even the low half. */
			j = i + hc->consumed_xmm_halves;
			if (j % 2)
				data[i] = sse128_hi(hc->xmm[j / 2]);
			else
				data[i] = sse128_lo(hc->xmm[j / 2]);
		}
		return 0;
	}

	return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
			      cnt * sizeof(*data));
}

/*
 * Read the sparse banks of a VP set into @sparse_banks.  Returns 0 on
 * success and non-zero on failure; note that the -EINVAL for an oversized
 * var_cnt is returned through a u64.
 */
static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
				 u64 *sparse_banks)
{
	if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
		return -EINVAL;

	/* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
	return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
				  sparse_banks);
}

/* Read all rep_cnt TLB flush entries of the hypercall into @entries. */
static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
{
	return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
}

/*
 * Queue @count flush @entries on @tlb_flush_fifo, or a single 'flush all'
 * entry when there are no entries or they do not fit.  Does nothing for a
 * vCPU without Hyper-V context.  The fifo's write_lock is held while queuing.
 */
static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
				 struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
				 u64 *entries, int count)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;

	if (!hv_vcpu)
		return;

	spin_lock(&tlb_flush_fifo->write_lock);

	/*
	 * All entries should fit on the fifo leaving one free for 'flush all'
	 * entry in case another request comes in. In case there's not enough
	 * space, just put 'flush all' entry there.
*/
	if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
		WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
		goto out_unlock;
	}

	/*
	 * Note: full fifo always contains 'flush all' entry, no need to check the
	 * return value.
	 */
	kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);

out_unlock:
	spin_unlock(&tlb_flush_fifo->write_lock);
}

/*
 * Drain the vCPU's TLB flush fifo (selected by whether the vCPU is in guest
 * mode) and flush every queued address range.
 *
 * Returns 0 when all queued entries were handled, -EINVAL when TDP is
 * disabled or the vCPU has no Hyper-V context, and -ENOSPC when a 'flush all'
 * entry was found; in that case the fifo is reset and a full flush is needed
 * instead.
 */
int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
	int i, j, count;
	gva_t gva;

	if (!tdp_enabled || !hv_vcpu)
		return -EINVAL;

	tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));

	count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);

	for (i = 0; i < count; i++) {
		if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
			goto out_flush_all;

		/* Non-canonical addresses are skipped rather than flushed. */
		if (is_noncanonical_invlpg_address(entries[i], vcpu))
			continue;

		/*
		 * Lower 12 bits of 'address' encode the number of additional
		 * pages to flush.
		 */
		gva = entries[i] & PAGE_MASK;
		for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
			kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);

		++vcpu->stat.tlb_flush;
	}
	return 0;

out_flush_all:
	kfifo_reset_out(&tlb_flush_fifo->entries);

	/* Fall back to full flush. */
	return -ENOSPC;
}

/*
 * Handle the HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}[_EX] hypercalls.
 *
 * The input is taken from registers/XMM for 'fast' calls and from guest
 * memory otherwise.  The target vCPUs are derived from the processor mask or
 * sparse VP set, the flush entries (if any were read) are queued on each
 * target's fifo and KVM_REQ_HV_TLB_FLUSH is raised for the targets.
 *
 * Returns HV_STATUS_INVALID_HYPERCALL_INPUT on malformed input or a failed
 * guest read, otherwise HV_STATUS_SUCCESS combined with a 'reps completed'
 * count equal to rep_cnt.
 */
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	unsigned long *vcpu_mask = hv_vcpu->vcpu_mask;
	u64 *sparse_banks = hv_vcpu->sparse_banks;
	struct kvm *kvm = vcpu->kvm;
	struct hv_tlb_flush_ex flush_ex;
	struct hv_tlb_flush flush;
	struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
	/*
	 * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
	 * entries on the TLB flush fifo. The last entry, however, needs to be
	 * always left free for 'flush all' entry which gets placed when
	 * there is not enough space to put all the requested entries.
	 */
	u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
	u64 *tlb_flush_entries;
	u64 valid_bank_mask;
	struct kvm_vcpu *v;
	unsigned long i;
	bool all_cpus;

	/*
	 * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
	 * sparse banks. Fail the build if KVM's max allowed number of
	 * vCPUs (>4096) exceeds this limit.
	 */
	BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);

	/*
	 * 'Slow' hypercall's first parameter is the address in guest's memory
	 * where hypercall parameters are placed. This is either a GPA or a
	 * nested GPA when KVM is handling the call from L2 ('direct' TLB
	 * flush).  Translate the address here so the memory can be uniformly
	 * read with kvm_read_guest().
	 */
	if (!hc->fast && is_guest_mode(vcpu)) {
		hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
		if (unlikely(hc->ingpa == INVALID_GPA))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;
	}

	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
		/* Non-EX variants: a single 64-bit processor mask. */
		if (hc->fast) {
			flush.address_space = hc->ingpa;
			flush.flags = hc->outgpa;
			flush.processor_mask = sse128_lo(hc->xmm[0]);
			hc->consumed_xmm_halves = 1;
		} else {
			if (unlikely(kvm_read_guest(kvm, hc->ingpa,
						    &flush, sizeof(flush))))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
			hc->data_offset = sizeof(flush);
		}

		trace_kvm_hv_flush_tlb(flush.processor_mask,
				       flush.address_space, flush.flags,
				       is_guest_mode(vcpu));

		/* Represent the mask as a sparse set with only bank 0 valid. */
		valid_bank_mask = BIT_ULL(0);
		sparse_banks[0] = flush.processor_mask;

		/*
		 * Work around possible WS2012 bug: it sends hypercalls
		 * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
		 * while also expecting us to flush something and crashing if
		 * we don't. Let's treat processor_mask == 0 same as
		 * HV_FLUSH_ALL_PROCESSORS.
		 */
		all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
			flush.processor_mask == 0;
	} else {
		/* EX variants: a generic VP set followed by its sparse banks. */
		if (hc->fast) {
			flush_ex.address_space = hc->ingpa;
			flush_ex.flags = hc->outgpa;
			memcpy(&flush_ex.hv_vp_set,
			       &hc->xmm[0], sizeof(hc->xmm[0]));
			hc->consumed_xmm_halves = 2;
		} else {
			if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
						    sizeof(flush_ex))))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
			hc->data_offset = sizeof(flush_ex);
		}

		trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
					  flush_ex.hv_vp_set.format,
					  flush_ex.address_space,
					  flush_ex.flags, is_guest_mode(vcpu));

		valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
		/* Every format other than SPARSE_4K is treated as 'all vCPUs'. */
		all_cpus = flush_ex.hv_vp_set.format !=
			HV_GENERIC_SET_SPARSE_4K;

		/* The variable header must carry exactly one entry per valid bank. */
		if (hc->var_cnt != hweight64(valid_bank_mask))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;

		if (!all_cpus) {
			/* An empty sparse set targets nobody: nothing to flush. */
			if (!hc->var_cnt)
				goto ret_success;

			if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
		}

		/*
		 * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
		 * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
		 * case (HV_GENERIC_SET_ALL).  Always adjust data_offset and
		 * consumed_xmm_halves to make sure TLB flush entries are read
		 * from the correct offset.
		 */
		if (hc->fast)
			hc->consumed_xmm_halves += hc->var_cnt;
		else
			hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
	}

	/*
	 * Address-space flushes, and lists too long for the local buffer, carry
	 * no entries; a NULL list makes hv_tlb_flush_enqueue() queue 'flush all'.
	 */
	if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
	    hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
	    hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
		tlb_flush_entries = NULL;
	} else {
		if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;
		tlb_flush_entries = __tlb_flush_entries;
	}

	/*
	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
	 * analyze it here, flush TLB regardless of the specified address space.
	 */
	if (all_cpus && !is_guest_mode(vcpu)) {
		/* L1 request for every vCPU. */
		kvm_for_each_vcpu(i, v, kvm) {
			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
					     tlb_flush_entries, hc->rep_cnt);
		}

		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
	} else if (!is_guest_mode(vcpu)) {
		/* L1 request for the vCPUs named by the sparse set. */
		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);

		for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
			v = kvm_get_vcpu(kvm, i);
			if (!v)
				continue;
			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
					     tlb_flush_entries, hc->rep_cnt);
		}

		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
	} else {
		/* Request issued from guest mode: match on nested VM and VP ids. */
		struct kvm_vcpu_hv *hv_v;

		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);

		kvm_for_each_vcpu(i, v, kvm) {
			hv_v = to_hv_vcpu(v);

			/*
			 * The following check races with nested vCPUs entering/exiting
			 * and/or migrating between L1's vCPUs, however the only case when
			 * KVM *must* flush the TLB is when the target L2 vCPU keeps
			 * running on the same L1 vCPU from the moment of the request until
			 * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
			 * cases, e.g. when the target L2 vCPU migrates to a different L1
			 * vCPU or when the corresponding L1 vCPU temporary switches to a
			 * different L2 vCPU while the request is being processed.
			 */
			if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
				continue;

			if (!all_cpus &&
			    !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
						    sparse_banks))
				continue;

			__set_bit(i, vcpu_mask);
			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
					     tlb_flush_entries, hc->rep_cnt);
		}

		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
	}

ret_success:
	/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
	return (u64)HV_STATUS_SUCCESS |
		((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}

/*
 * Deliver a fixed-mode interrupt with @vector to every vCPU of @kvm, or only
 * to the vCPUs whose VP index is in the sparse set when @sparse_banks is not
 * NULL.
 */
static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
				    u64 *sparse_banks, u64 valid_bank_mask)
{
	struct kvm_lapic_irq irq = {
		.delivery_mode = APIC_DM_FIXED,
		.vector = vector
	};
	struct kvm_vcpu *vcpu;
	unsigned long i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (sparse_banks &&
		    !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
					    valid_bank_mask, sparse_banks))
			continue;

		/* We fail only when APIC is disabled */
		kvm_apic_set_irq(vcpu, &irq, NULL);
	}
}

/*
 * Handle HVCALL_SEND_IPI and HVCALL_SEND_IPI_EX.
 *
 * Requires an in-kernel local APIC.  Returns
 * HV_STATUS_INVALID_HYPERCALL_INPUT on malformed input, a failed guest read
 * or a vector outside [HV_IPI_LOW_VECTOR, HV_IPI_HIGH_VECTOR], and
 * HV_STATUS_SUCCESS otherwise.
 */
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	u64 *sparse_banks = hv_vcpu->sparse_banks;
	struct kvm *kvm = vcpu->kvm;
	struct hv_send_ipi_ex send_ipi_ex;
	struct hv_send_ipi send_ipi;
	u64 valid_bank_mask;
	u32 vector;
	bool all_cpus;

	if (!lapic_in_kernel(vcpu))
		return HV_STATUS_INVALID_HYPERCALL_INPUT;

	if (hc->code == HVCALL_SEND_IPI) {
		if (!hc->fast) {
			if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
						    sizeof(send_ipi))))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
			sparse_banks[0] = send_ipi.cpu_mask;
			vector = send_ipi.vector;
		} else {
			/* 'reserved' part of hv_send_ipi should be 0 */
			if (unlikely(hc->ingpa >> 32 != 0))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
			/* Fast call: vector in the first input register, mask in the second. */
			sparse_banks[0] = hc->outgpa;
			vector = (u32)hc->ingpa;
		}
		all_cpus = false;
		valid_bank_mask = BIT_ULL(0);

		trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
	} else {
		if (!hc->fast) {
			if (unlikely(kvm_read_guest(kvm, hc->ingpa,
&send_ipi_ex,
						    sizeof(send_ipi_ex))))
				return HV_STATUS_INVALID_HYPERCALL_INPUT;
		} else {
			send_ipi_ex.vector = (u32)hc->ingpa;
			send_ipi_ex.vp_set.format = hc->outgpa;
			send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
		}

		trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
					 send_ipi_ex.vp_set.format,
					 send_ipi_ex.vp_set.valid_bank_mask);

		vector = send_ipi_ex.vector;
		valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
		all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;

		/* The variable header must carry exactly one entry per valid bank. */
		if (hc->var_cnt != hweight64(valid_bank_mask))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;

		if (all_cpus)
			goto check_and_send_ipi;

		/* An empty sparse set targets nobody; the vector is not even checked. */
		if (!hc->var_cnt)
			goto ret_success;

		/* Position the read cursor at the start of the sparse banks. */
		if (!hc->fast)
			hc->data_offset = offsetof(struct hv_send_ipi_ex,
						   vp_set.bank_contents);
		else
			hc->consumed_xmm_halves = 1;

		if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
			return HV_STATUS_INVALID_HYPERCALL_INPUT;
	}

check_and_send_ipi:
	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
		return HV_STATUS_INVALID_HYPERCALL_INPUT;

	if (all_cpus)
		kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
	else
		kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);

ret_success:
	return HV_STATUS_SUCCESS;
}

/*
 * Record whether Hyper-V is enabled for the vCPU and refresh the cached
 * Hyper-V CPUID registers used by the access checks.  The cache is cleared
 * first and stays zero when Hyper-V is disabled or a leaf is absent.
 */
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vcpu->arch.hyperv_enabled = hyperv_enabled;

	if (!hv_vcpu) {
		/*
		 * KVM should have already allocated kvm_vcpu_hv if Hyper-V is
		 * enabled in CPUID.
		 */
		WARN_ON_ONCE(vcpu->arch.hyperv_enabled);
		return;
	}

	memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache));

	if (!vcpu->arch.hyperv_enabled)
		return;

	entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
	if (entry) {
		hv_vcpu->cpuid_cache.features_eax = entry->eax;
		hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
		hv_vcpu->cpuid_cache.features_edx = entry->edx;
	}

	entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
	if (entry) {
		hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
		hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
	}

	entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
	if (entry)
		hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;

	entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES);
	if (entry) {
		hv_vcpu->cpuid_cache.nested_eax = entry->eax;
		hv_vcpu->cpuid_cache.nested_ebx = entry->ebx;
	}
}

/*
 * Turn CPUID based enforcement of Hyper-V MSR/hypercall access on or off.
 *
 * The Hyper-V vCPU context is created on demand only when enabling;
 * disabling without a context is a no-op.  Returns 0 on success or the
 * kvm_hv_vcpu_init() error.
 */
int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
{
	struct kvm_vcpu_hv *hv_vcpu;
	int ret = 0;

	if (!to_hv_vcpu(vcpu)) {
		if (enforce) {
			ret = kvm_hv_vcpu_init(vcpu);
			if (ret)
				return ret;
		} else {
			return 0;
		}
	}

	hv_vcpu = to_hv_vcpu(vcpu);
	hv_vcpu->enforce_cpuid = enforce;

	return ret;
}

/*
 * Store the hypercall @result in the guest registers: RAX for a 64-bit
 * hypercall, otherwise the upper half in RDX and the lower half in RAX.
 */
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
	bool longmode;

	longmode = is_64_bit_hypercall(vcpu);
	if (longmode)
		kvm_rax_write(vcpu, result);
	else {
		kvm_rdx_write(vcpu, result >> 32);
		kvm_rax_write(vcpu, result & 0xffffffff);
	}
}

/*
 * Finish a hypercall: report @result to the guest, count it and skip the
 * hypercall instruction.  Returns the kvm_skip_emulated_instruction() result.
 *
 * For a successful TLB flush hypercall issued in guest mode, a 32-bit value
 * is read from the GPA in nested.pa_page_gpa; a failed read turns the result
 * into HV_STATUS_INVALID_HYPERCALL_INPUT, and a non-zero value triggers the
 * post-TLB-flush synthetic vmexit once the instruction has been skipped.
 */
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
	u32 tlb_lock_count = 0;
	int ret;

	if (hv_result_success(result) && is_guest_mode(vcpu) &&
	    kvm_hv_is_tlb_flush_hcall(vcpu) &&
	    kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
			   &tlb_lock_count, sizeof(tlb_lock_count)))
		result = HV_STATUS_INVALID_HYPERCALL_INPUT;

	trace_kvm_hv_hypercall_done(result);
	kvm_hv_hypercall_set_result(vcpu, result);
	++vcpu->stat.hypercalls;

	ret = kvm_skip_emulated_instruction(vcpu);

	if (tlb_lock_count)
kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);

	return ret;
}

/* Complete a hypercall using the result stored in vcpu->run by userspace. */
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
	return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
}

/*
 * Handle HVCALL_SIGNAL_EVENT by signalling the eventfd registered for the
 * connection id.
 *
 * For non-'fast' calls the parameter is first read from guest memory at
 * hc->ingpa, which must be naturally aligned and must not cross a page.
 * Returns HV_STATUS_SUCCESS, HV_STATUS_INVALID_ALIGNMENT,
 * HV_STATUS_INVALID_HYPERCALL_INPUT, or HV_STATUS_INVALID_PORT_ID when no
 * eventfd is registered for the id.
 */
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
	struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
	struct eventfd_ctx *eventfd;

	if (unlikely(!hc->fast)) {
		int ret;
		gpa_t gpa = hc->ingpa;

		if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
		    offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
			return HV_STATUS_INVALID_ALIGNMENT;

		/* Replace the pointer with the parameter value it points to. */
		ret = kvm_vcpu_read_guest(vcpu, gpa,
					  &hc->ingpa, sizeof(hc->ingpa));
		if (ret < 0)
			return HV_STATUS_INVALID_ALIGNMENT;
	}

	/*
	 * Per spec, bits 32-47 contain the extra "flag number".  However, we
	 * have no use for it, and in all known usecases it is zero, so just
	 * report lookup failure if it isn't.
	 */
	if (hc->ingpa & 0xffff00000000ULL)
		return HV_STATUS_INVALID_PORT_ID;
	/* remaining bits are reserved-zero */
	if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
		return HV_STATUS_INVALID_HYPERCALL_INPUT;

	/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
	rcu_read_lock();
	eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
	rcu_read_unlock();
	if (!eventfd)
		return HV_STATUS_INVALID_PORT_ID;

	eventfd_signal(eventfd);
	return HV_STATUS_SUCCESS;
}

/* Return true for the hypercall codes listed below, false for all others. */
static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
{
	switch (hc->code) {
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
	case HVCALL_SEND_IPI_EX:
		return true;
	}

	return false;
}

/*
 * Copy the first HV_HYPERCALL_MAX_XMM_REGISTERS SSE registers into hc->xmm,
 * bracketed by kvm_fpu_get()/kvm_fpu_put().
 */
static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
{
	int reg;

	kvm_fpu_get();
	for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
		_kvm_read_sse_reg(reg, &hc->xmm[reg]);
	kvm_fpu_put();
}

/*
 * Decide whether hypercall @code may be used given the Hyper-V CPUID bits
 * cached for the vCPU.  Always true when enforce_cpuid is off; codes without
 * an entry in the switch are allowed.
 */
static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
{
	if (!hv_vcpu->enforce_cpuid)
		return true;

	switch (code) {
	case HVCALL_NOTIFY_LONG_SPIN_WAIT:
		/* Allowed only when enlightenments EBX is neither 0 nor U32_MAX. */
		return hv_vcpu->cpuid_cache.enlightenments_ebx &&
			hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
	case HVCALL_POST_MESSAGE:
		return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
	case HVCALL_SIGNAL_EVENT:
		return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
	case HVCALL_POST_DEBUG_DATA:
	case HVCALL_RETRIEVE_DEBUG_DATA:
	case HVCALL_RESET_DEBUG_SESSION:
		/*
		 * Return 'true' when SynDBG is disabled so the resulting code
		 * will be HV_STATUS_INVALID_HYPERCALL_CODE.
		 */
		return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
			hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
		/* EX variants additionally need the EX processor masks bit. */
		if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
		      HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
			return false;
		fallthrough;
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
	case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
		return hv_vcpu->cpuid_cache.enlightenments_eax &
			HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
	case HVCALL_SEND_IPI_EX:
		/* EX variant additionally needs the EX processor masks bit. */
		if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
		      HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
			return false;
		fallthrough;
	case HVCALL_SEND_IPI:
		return hv_vcpu->cpuid_cache.enlightenments_eax &
			HV_X64_CLUSTER_IPI_RECOMMENDED;
	case HV_EXT_CALL_QUERY_CAPABILITIES ...
HV_EXT_CALL_MAX: return hv_vcpu->cpuid_cache.features_ebx & HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } return true; } int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifdef CONFIG_X86_64 if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); } else #endif { hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | (kvm_rcx_read(vcpu) & 0xffffffff); hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } hc.code = hc.param & 0xffff; hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; goto hypercall_complete; } if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } kvm_hv_hypercall_read_xmm(&hc); } switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: if 
(unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_SEND_IPI_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_send_ipi(vcpu, &hc); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { ret = HV_STATUS_OPERATION_DENIED; break; } goto hypercall_userspace_exit; } case HV_EXT_CALL_QUERY_CAPABILITIES ... 
HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); hypercall_userspace_exit: vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_init(&hv->hv_lock); idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; eventfd = eventfd_ctx_fdget(fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) return 0; if (ret == -ENOSPC) ret = -EEXIST; eventfd_ctx_put(eventfd); return ret; } static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); eventfd = idr_remove(&hv->conn_to_evt, conn_id); mutex_unlock(&hv->hv_lock); if (!eventfd) return -ENOENT; synchronize_srcu(&kvm->srcu); eventfd_ctx_put(eventfd); return 0; } int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) { if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) || (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK)) return -EINVAL; if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN) return 
kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, { .function = HYPERV_CPUID_VERSION }, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); if (cpuid->nent < nent) return -E2BIG; if (cpuid->nent > nent) cpuid->nent = nent; for (i = 0; i < nent; i++) { struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; u32 signature[3]; switch (ent->function) { case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_INTERFACE: ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: /* * We implement some Hyper-V 2016 functions so let's use * this version. 
*/ ent->eax = 0x00003839; ent->ebx = 0x000A0000; break; case HYPERV_CPUID_FEATURES: ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel * LAPIC */ if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; if (!vcpu || lapic_in_kernel(vcpu)) ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. */ ent->ebx = 0x00000FFF; break; case HYPERV_CPUID_IMPLEMENT_LIMITS: /* Maximum number of virtual processors */ ent->eax = KVM_MAX_VCPUS; /* * Maximum number of logical processors, matches * HyperV 2016. 
*/ ent->ebx = 64; break; case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = 0; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_SYNDBG_INTERFACE: memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); ent->eax = signature[0]; break; case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; break; default: break; } } if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; return 0; }
30 29 30 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * Copyright 2016 Red Hat, Inc. and/or its affiliates. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" #include "mmu.h" #include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->arch.apic->lapic_timer.timer_advance_ns; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n"); static int vcpu_get_guest_mode(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->stat.guest_mode; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n"); static int vcpu_get_tsc_offset(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->arch.tsc_offset; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->arch.tsc_scaling_ratio; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); static 
int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu, &vcpu_guest_mode_fops); debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); if (lapic_in_kernel(vcpu)) debugfs_create_file("lapic_timer_advance_ns", 0444, debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_frac_fops); } } /* * This covers statistics <1024 (11=log(1024)+1), which should be enough to * cover RMAP_RECYCLE_THRESHOLD. */ #define RMAP_LOG_SIZE 11 static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) { struct kvm_rmap_head *rmap; struct kvm *kvm = m->private; struct kvm_memory_slot *slot; struct kvm_memslots *slots; unsigned int lpage_size, index; /* Still small enough to be on the stack */ unsigned int *log[KVM_NR_PAGE_SIZES], *cur; int i, j, k, l, ret; if (!kvm_memslots_have_rmaps(kvm)) return 0; ret = -ENOMEM; memset(log, 0, sizeof(log)); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); if (!log[i]) goto out; } mutex_lock(&kvm->slots_lock); write_lock(&kvm->mmu_lock); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { int bkt; slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { rmap = slot->arch.rmap[k]; lpage_size = kvm_mmu_slot_lpages(slot, k + 1); cur = log[k]; for (l = 0; l < lpage_size; l++) { index = 
ffs(pte_list_count(&rmap[l])); if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) index = RMAP_LOG_SIZE - 1; cur[index]++; } } } write_unlock(&kvm->mmu_lock); mutex_unlock(&kvm->slots_lock); /* index=0 counts no rmap; index=1 counts 1 rmap */ seq_printf(m, "Rmap_Count:\t0\t1\t"); for (i = 2; i < RMAP_LOG_SIZE; i++) { j = 1 << (i - 1); k = (1 << i) - 1; seq_printf(m, "%d-%d\t", j, k); } seq_printf(m, "\n"); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); cur = log[i]; for (j = 0; j < RMAP_LOG_SIZE; j++) seq_printf(m, "%d\t", cur[j]); seq_printf(m, "\n"); } ret = 0; out: for (i = 0; i < KVM_NR_PAGE_SIZES; i++) kfree(log[i]); return ret; } static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) { struct kvm *kvm = inode->i_private; int r; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; r = single_open(file, kvm_mmu_rmaps_stat_show, kvm); if (r < 0) kvm_put_kvm(kvm); return r; } static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) { struct kvm *kvm = inode->i_private; kvm_put_kvm(kvm); return single_release(inode, file); } static const struct file_operations mmu_rmaps_stat_fops = { .owner = THIS_MODULE, .open = kvm_mmu_rmaps_stat_open, .read = seq_read, .llseek = seq_lseek, .release = kvm_mmu_rmaps_stat_release, }; void kvm_arch_create_vm_debugfs(struct kvm *kvm) { debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, &mmu_rmaps_stat_fops); }
7 7 7 7 7 5 7 7 3 7 7 7 7 4 7 7 6 6 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. 
* */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

#include "debug.h"
#include "ntfs_fs.h"

// clang-format off
/* Src buffer is zero. */
#define LZNT_ERROR_ALL_ZEROS	1
/* Maximum number of uncompressed bytes handled per chunk. */
#define LZNT_CHUNK_SIZE		0x1000
// clang-format on

/*
 * One hash bucket used by longest_match_std(): the two most recent source
 * positions that hashed to this bucket (p1 is the newer one).
 */
struct lznt_hash {
	const u8 *p1;
	const u8 *p2;
};

/*
 * Compression context.
 *
 * @unc, @unc_end: bounds of the chunk currently being compressed; set by
 *                 compress_chunk().
 * @best_match:    start of the match found by the last match callback.
 * @max_len:       current upper bound on match length (from s_max_len[]).
 * @std:           true when the hash-based matcher is used.
 * @hash:          hash table for longest_match_std(). get_lznt_ctx() does not
 *                 allocate this member when a non-zero level is requested, so
 *                 it must only be touched when @std is true.
 */
struct lznt {
	const u8 *unc;
	const u8 *unc_end;
	const u8 *best_match;
	size_t max_len;
	bool std;

	struct lznt_hash hash[LZNT_CHUNK_SIZE];
};

/*
 * get_match_len - Count how many leading bytes of @ptr equal those of @prev.
 *
 * Stops at @end (exclusive bound for @ptr) or after @max_len bytes.
 */
static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev,
				   size_t max_len)
{
	size_t len = 0;

	while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len)
		;
	return len;
}

/*
 * longest_match_std - Hash-based match finder.
 *
 * Hashes the three bytes at @src, checks the two positions remembered in that
 * bucket, stores the better one in ctx->best_match and records @src as the
 * newest position of the bucket.
 *
 * Return: length of the selected match, or 0 if neither candidate matches.
 */
static size_t longest_match_std(const u8 *src, struct lznt *ctx)
{
	size_t hash_index;
	size_t len1 = 0, len2 = 0;
	const u8 **hash;

	hash_index =
		((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) &
		(LZNT_CHUNK_SIZE - 1);

	/* View the bucket as a two-element array: hash[0] == p1, hash[1] == p2. */
	hash = &(ctx->hash[hash_index].p1);

	/*
	 * A candidate is usable only if it lies inside the current chunk,
	 * before @src, and its first three bytes really match.
	 */
	if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] &&
	    hash[0][1] == src[1] && hash[0][2] == src[2]) {
		len1 = 3;
		if (ctx->max_len > 3)
			len1 += get_match_len(src + 3, ctx->unc_end,
					      hash[0] + 3, ctx->max_len - 3);
	}

	if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] &&
	    hash[1][1] == src[1] && hash[1][2] == src[2]) {
		len2 = 3;
		if (ctx->max_len > 3)
			len2 += get_match_len(src + 3, ctx->unc_end,
					      hash[1] + 3, ctx->max_len - 3);
	}

	/* Compare two matches and select the best one. */
	if (len1 < len2) {
		ctx->best_match = hash[1];
		len1 = len2;
	} else {
		ctx->best_match = hash[0];
	}

	/* Age the bucket: the previous newest entry becomes the older one. */
	hash[1] = hash[0];
	hash[0] = src;
	return len1;
}

/*
 * longest_match_best - Exhaustive match finder.
 *
 * Tries every position from the start of the chunk up to @src. On equal
 * lengths the later position wins (the comparison is '>=').
 *
 * Return: length of the best match, or 0 if it is shorter than 3 bytes.
 */
static size_t longest_match_best(const u8 *src, struct lznt *ctx)
{
	size_t max_len;
	const u8 *ptr;

	if (ctx->unc >= src || !ctx->max_len)
		return 0;

	max_len = 0;
	for (ptr = ctx->unc; ptr < src; ++ptr) {
		size_t len =
			get_match_len(src, ctx->unc_end, ptr, ctx->max_len);
		if (len >= max_len) {
			max_len = len;
			ctx->best_match = ptr;
		}
	}

	return max_len >= 3 ? max_len : 0;
}

/*
 * Per-index limits. The index grows as the position inside the chunk grows
 * (see the s_max_off[] loops below); each step moves one bit of the 16-bit
 * pair from the length field to the offset field (see make_pair()).
 */
static const size_t s_max_len[] = {
	0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12,
};

static const size_t s_max_off[] = {
	0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
};

/*
 * make_pair - Pack (@offset, @len) into 16 bits.
 *
 * The low (12 - @index) bits hold @len - 3, the bits above hold @offset - 1.
 */
static inline u16 make_pair(size_t offset, size_t len, size_t index)
{
	return ((offset - 1) << (12 - index)) |
	       ((len - 3) & (((1 << (12 - index)) - 1)));
}

/*
 * parse_pair - Inverse of make_pair().
 *
 * Stores the offset in @offset and returns the length.
 */
static inline size_t parse_pair(u16 pair, size_t *offset, size_t index)
{
	*offset = 1 + (pair >> (12 - index));
	return 3 + (pair & ((1 << (12 - index)) - 1));
}

/*
 * compress_chunk
 *
 * Compresses at most LZNT_CHUNK_SIZE bytes starting at @unc into @cmpr using
 * the @match callback. If the compressed form does not fit, the chunk is
 * stored raw behind a 0x3FFF header instead.
 *
 * Return:
 * * 0	- Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data.
 * * 1	- Input buffer is full zero.
 * * -2 - The compressed buffer is too small to hold the compressed data.
 */
static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *),
				 const u8 *unc, const u8 *unc_end, u8 *cmpr,
				 u8 *cmpr_end, size_t *cmpr_chunk_size,
				 struct lznt *ctx)
{
	size_t cnt = 0;
	size_t idx = 0;
	const u8 *up = unc;
	/* cmpr[0..1] is the chunk header, cmpr[2] the first control byte. */
	u8 *cp = cmpr + 3;
	u8 *cp2 = cmpr + 2;
	u8 not_zero = 0;
	/* Control byte of 8-bit values: ( 0 - means byte as is, 1 - short pair ). */
	u8 ohdr = 0;
	u8 *last;
	u16 t16;

	/* Never consume more than one chunk of input. */
	if (unc + LZNT_CHUNK_SIZE < unc_end)
		unc_end = unc + LZNT_CHUNK_SIZE;

	/* Output limit: a raw chunk plus header, or the end of @cmpr if sooner. */
	last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end);

	ctx->unc = unc;
	ctx->unc_end = unc_end;
	ctx->max_len = s_max_len[0];

	while (up < unc_end) {
		size_t max_len;

		/* Move to the next pair layout once the position outgrows it. */
		while (unc + s_max_off[idx] < up)
			ctx->max_len = s_max_len[++idx];

		/* Find match. */
		max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0;

		if (!max_len) {
			/* Literal byte; also remember whether any byte is non-zero. */
			if (cp >= last)
				goto NotCompressed;
			not_zero |= *cp++ = *up++;
		} else if (cp + 1 >= last) {
			goto NotCompressed;
		} else {
			/* Emit the pair little endian and flag it in the control byte. */
			t16 = make_pair(up - ctx->best_match, max_len, idx);
			*cp++ = t16;
			*cp++ = t16 >> 8;

			ohdr |= 1 << cnt;
			up += max_len;
		}

		/* After 8 items flush the control byte and reserve the next one. */
		cnt = (cnt + 1) & 7;
		if (!cnt) {
			*cp2 = ohdr;
			ohdr = 0;
			cp2 = cp;
			cp += 1;
		}
	}

	/* Flush the last control byte, or give back the unused reserved slot. */
	if (cp2 < last)
		*cp2 = ohdr;
	else
		cp -= 1;

	*cmpr_chunk_size = cp - cmpr;

	/* Header of a compressed chunk: (size - 3) with 0xB000 set. */
	t16 = (*cmpr_chunk_size - 3) | 0xB000;
	cmpr[0] = t16;
	cmpr[1] = t16 >> 8;

	return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS;

NotCompressed:

	if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last)
		return -2;

	/*
	 * Copy non cmpr data.
	 * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000)
	 */
	cmpr[0] = 0xff;
	cmpr[1] = 0x3f;

	/*
	 * NOTE(review): this always reads LZNT_CHUNK_SIZE bytes from @unc, even
	 * if fewer remain before the caller's unc_end. Presumably callers only
	 * pass input that is a multiple of the chunk size - confirm.
	 */
	memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE);
	*cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short);

	return 0;
}

/*
 * decompress_chunk - Decompress the body of one compressed chunk.
 *
 * @cmpr points just past the two-byte chunk header; @cmpr_end is the end of
 * this chunk's data.
 *
 * Return: number of bytes written to @unc, or -EINVAL on malformed input.
 */
static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr,
				       const u8 *cmpr_end)
{
	u8 *up = unc;
	/* First control byte; one bit per following item. */
	u8 ch = *cmpr++;
	size_t bit = 0;
	size_t index = 0;
	u16 pair;
	size_t offset, length;

	/* Do decompression until pointers are inside range. */
	while (up < unc_end && cmpr < cmpr_end) {
		// return err if more than LZNT_CHUNK_SIZE bytes are written
		if (up - unc > LZNT_CHUNK_SIZE)
			return -EINVAL;
		/* Correct index */
		while (unc + s_max_off[index] < up)
			index += 1;

		/* Check the current flag for zero. */
		if (!(ch & (1 << bit))) {
			/* Just copy byte. */
			*up++ = *cmpr++;
			goto next;
		}

		/* Check for boundary. */
		if (cmpr + 1 >= cmpr_end)
			return -EINVAL;

		/* Read a short from little endian stream. */
		pair = cmpr[1];
		pair <<= 8;
		pair |= cmpr[0];

		cmpr += 2;

		/* Translate packed information into offset and length. */
		length = parse_pair(pair, &offset, index);

		/* Check offset for boundary. */
		if (unc + offset > up)
			return -EINVAL;

		/* Truncate the length if necessary. */
		if (up + length >= unc_end)
			length = unc_end - up;

		/*
		 * Now we copy bytes. This is the heart of LZ algorithm.
		 * The copy is done byte by byte because source and destination
		 * may overlap (offset smaller than length).
		 */
		for (; length > 0; length--, up++)
			*up = *(up - offset);

next:
		/* Advance flag bit value. */
		bit = (bit + 1) & 7;

		if (!bit) {
			/* All 8 flags consumed: fetch the next control byte. */
			if (cmpr >= cmpr_end)
				break;

			ch = *cmpr++;
		}
	}

	/* Return the size of uncompressed data. */
	return up - unc;
}

/*
 * get_lznt_ctx
 * @level: 0 - Standard compression.
 *	   !0 - Best compression, requires a lot of cpu.
 *
 * For a non-zero @level the hash table is not needed, so only the part of
 * struct lznt that precedes it is allocated.
 *
 * Return: zeroed context allocated with GFP_NOFS, or NULL on failure.
 */
struct lznt *get_lznt_ctx(int level)
{
	struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) :
					 sizeof(struct lznt),
				 GFP_NOFS);

	if (r)
		r->std = !level;
	return r;
}

/*
 * compress_lznt - Compresses @unc into @cmpr
 *
 * Return:
 * * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data.
 * * 0 - Input buffer is full zero.
 * * @unc_size - A chunk could not be stored in @cmpr (compress_chunk() failed).
 */
size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr,
		     size_t cmpr_size, struct lznt *ctx)
{
	int err;
	size_t (*match)(const u8 *src, struct lznt *ctx);
	u8 *p = cmpr;
	u8 *end = p + cmpr_size;
	const u8 *unc_chunk = unc;
	const u8 *unc_end = unc_chunk + unc_size;
	bool is_zero = true;

	if (ctx->std) {
		match = &longest_match_std;
		/* Forget positions remembered from a previous buffer. */
		memset(ctx->hash, 0, sizeof(ctx->hash));
	} else {
		match = &longest_match_best;
	}

	/* Compression cycle. */
	for (; unc_chunk < unc_end; unc_chunk += LZNT_CHUNK_SIZE) {
		cmpr_size = 0;
		err = compress_chunk(match, unc_chunk, unc_end, p, end,
				     &cmpr_size, ctx);
		if (err < 0)
			return unc_size;

		if (is_zero && err != LZNT_ERROR_ALL_ZEROS)
			is_zero = false;

		p += cmpr_size;
	}

	/* Append a zero chunk header as terminator when there is room. */
	if (p <= end - 2)
		p[0] = p[1] = 0;

	return is_zero ? 0 : PtrOffset(cmpr, p);
}

/*
 * decompress_lznt - Decompress @cmpr into @unc.
 *
 * Walks the chunk sequence in @cmpr until @unc is full, the input is
 * exhausted or a zero chunk header is found.
 *
 * Return: value of PtrOffset(unc, <end of written data>) on success
 * (presumably the number of bytes produced - PtrOffset() is defined
 * elsewhere), or -EINVAL on malformed input.
 */
ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
			size_t unc_size)
{
	const u8 *cmpr_chunk = cmpr;
	const u8 *cmpr_end = cmpr_chunk + cmpr_size;
	u8 *unc_chunk = unc;
	u8 *unc_end = unc_chunk + unc_size;
	u16 chunk_hdr;

	if (cmpr_size < sizeof(short))
		return -EINVAL;

	/* Read chunk header. */
	chunk_hdr = cmpr_chunk[1];
	chunk_hdr <<= 8;
	chunk_hdr |= cmpr_chunk[0];

	/* Loop through decompressing chunks. */
	for (;;) {
		size_t chunk_size_saved;
		size_t unc_use;
		/* The low 12 header bits store (chunk size - 3). */
		size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1));

		/* Check that the chunk actually fits the supplied buffer. */
		if (cmpr_chunk + cmpr_use > cmpr_end)
			return -EINVAL;

		/* First make sure the chunk contains compressed data. */
		if (chunk_hdr & 0x8000) {
			/* Decompress a chunk and return if we get an error. */
			ssize_t err =
				decompress_chunk(unc_chunk, unc_end,
						 cmpr_chunk + sizeof(chunk_hdr),
						 cmpr_chunk + cmpr_use);
			if (err < 0)
				return err;
			unc_use = err;
		} else {
			/* This chunk does not contain compressed data. */
			unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ?
					  unc_end - unc_chunk :
					  LZNT_CHUNK_SIZE;

			if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
			    cmpr_end) {
				return -EINVAL;
			}

			memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr),
			       unc_use);
		}

		/* Advance pointers. */
		cmpr_chunk += cmpr_use;
		unc_chunk += unc_use;

		/* Check for the end of unc buffer. */
		if (unc_chunk >= unc_end)
			break;

		/* Proceed the next chunk. */
		if (cmpr_chunk > cmpr_end - 2)
			break;

		chunk_size_saved = LZNT_CHUNK_SIZE;

		/* Read chunk header. */
		chunk_hdr = cmpr_chunk[1];
		chunk_hdr <<= 8;
		chunk_hdr |= cmpr_chunk[0];

		/* A zero header terminates the stream. */
		if (!chunk_hdr)
			break;

		/*
		 * Check the size of unc buffer.
		 * A chunk that produced less than LZNT_CHUNK_SIZE bytes is
		 * padded with zeros before the next chunk is decoded.
		 */
		if (unc_use < chunk_size_saved) {
			size_t t1 = chunk_size_saved - unc_use;
			u8 *t2 = unc_chunk + t1;

			/* 'Zero' memory. */
			if (t2 >= unc_end)
				break;

			memset(unc_chunk, 0, t1);
			unc_chunk = t2;
		}
	}

	/* Check compression boundary. */
	if (cmpr_chunk > cmpr_end)
		return -EINVAL;

	/*
	 * The unc size is just a difference between current
	 * pointer and original one.
	 */
	return PtrOffset(unc, unc_chunk);
}
32 32 11 44 44 44 44 44 44 31 32 32 99 46 82 82 98 99 77 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include "percpu_freelist.h" int pcpu_freelist_init(struct pcpu_freelist *s) { int cpu; s->freelist = alloc_percpu(struct pcpu_freelist_head); if (!s->freelist) return -ENOMEM; for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); raw_res_spin_lock_init(&head->lock); head->first = NULL; } return 0; } void pcpu_freelist_destroy(struct pcpu_freelist *s) { free_percpu(s->freelist); } static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; WRITE_ONCE(head->first, node); } static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { if (raw_res_spin_lock(&head->lock)) return false; pcpu_freelist_push_node(head, node); raw_res_spin_unlock(&head->lock); return true; } void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head; int cpu; if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) return; while (true) { for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { if (cpu == raw_smp_processor_id()) continue; head = per_cpu_ptr(s->freelist, cpu); if (raw_res_spin_lock(&head->lock)) continue; pcpu_freelist_push_node(head, node); raw_res_spin_unlock(&head->lock); return; } } } void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { unsigned 
long flags; local_irq_save(flags); __pcpu_freelist_push(s, node); local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; unsigned int cpu, cpu_idx, i, j, n, m; n = nr_elems / num_possible_cpus(); m = nr_elems % num_possible_cpus(); cpu_idx = 0; for_each_possible_cpu(cpu) { head = per_cpu_ptr(s->freelist, cpu); j = n + (cpu_idx < m ? 1 : 0); for (i = 0; i < j; i++) { /* No locking required as this is not visible yet. */ pcpu_freelist_push_node(head, buf); buf += elem_size; } cpu_idx++; } } static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; if (raw_res_spin_lock(&head->lock)) continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_res_spin_unlock(&head->lock); return node; } raw_res_spin_unlock(&head->lock); } return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *ret; unsigned long flags; local_irq_save(flags); ret = __pcpu_freelist_pop(s); local_irq_restore(flags); return ret; }
4644 283 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 /* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * This file contains the main data structure and API definitions. 
/* Include guard for the mutex data-structure and API declarations. */
#ifndef __LINUX_MUTEX_H
#define __LINUX_MUTEX_H

#include <asm/current.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
#include <linux/cleanup.h>
#include <linux/mutex_types.h>

/* Only pointers to struct device are used below, so a forward declaration suffices. */
struct device;

/*
 * Optional trailing members of __MUTEX_INITIALIZER(). Each one expands either
 * to nothing or to a designated initializer that starts with a comma, so it
 * can be appended after the mandatory members.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)			\
		, .dep_map = {					\
			.name = #lockname,			\
			.wait_type_inner = LD_WAIT_SLEEP,	\
		}
#else
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
#endif

#ifdef CONFIG_DEBUG_MUTEXES

# define __DEBUG_MUTEX_INITIALIZER(lockname)				\
	, .magic = &lockname

extern void mutex_destroy(struct mutex *lock);

#else

# define __DEBUG_MUTEX_INITIALIZER(lockname)

/* Without CONFIG_DEBUG_MUTEXES, mutex_destroy() is an empty inline. */
static inline void mutex_destroy(struct mutex *lock) {}

#endif

/**
 * mutex_init - initialize the mutex
 * @mutex: the mutex to be initialized
 *
 * Initialize the mutex to unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 *
 * Every expansion declares its own static lock_class_key and passes it,
 * together with the stringified @mutex expression, to __mutex_init().
 */
#define mutex_init(mutex)						\
do {									\
	static struct lock_class_key __key;				\
									\
	__mutex_init((mutex), #mutex, &__key);				\
} while (0)

/**
 * mutex_init_with_key - initialize a mutex with a given lockdep key
 * @mutex: the mutex to be initialized
 * @key: the lockdep key to be associated with the mutex
 *
 * Initialize the mutex to the unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 */
#define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key))

#ifndef CONFIG_PREEMPT_RT
/* Static initializer for the non-RT mutex layout. */
#define __MUTEX_INITIALIZER(lockname) \
		{ .owner = ATOMIC_LONG_INIT(0) \
		, .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
		, .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
		__DEBUG_MUTEX_INITIALIZER(lockname) \
		__DEP_MAP_MUTEX_INITIALIZER(lockname) }

/* Define a struct mutex named @mutexname, statically initialized. */
#define DEFINE_MUTEX(mutexname) \
	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

extern void __mutex_init(struct mutex *lock, const char *name,
			 struct lock_class_key *key);

/**
 * mutex_is_locked - is the mutex locked
 * @lock: the mutex to be queried
 *
 * Returns true if the mutex is locked, false if unlocked.
 */
extern bool mutex_is_locked(struct mutex *lock);

#else /* !CONFIG_PREEMPT_RT */
/*
 * Preempt-RT variant based on rtmutexes.
 */

#define __MUTEX_INITIALIZER(mutexname)					\
{									\
	.rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex)	\
	__DEP_MAP_MUTEX_INITIALIZER(mutexname)				\
}

#define DEFINE_MUTEX(mutexname)						\
	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

extern void __mutex_rt_init(struct mutex *lock, const char *name,
			    struct lock_class_key *key);

#define mutex_is_locked(l)	rt_mutex_base_is_locked(&(l)->rtmutex)

/* On RT, __mutex_init() is a macro: set up the rtmutex base, then call __mutex_rt_init(). */
#define __mutex_init(mutex, name, key)			\
do {							\
	rt_mutex_base_init(&(mutex)->rtmutex);		\
	__mutex_rt_init((mutex), name, key);		\
} while (0)

#endif /* CONFIG_PREEMPT_RT */

#ifdef CONFIG_DEBUG_MUTEXES

int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock);

#else

static inline int __must_check __devm_mutex_init(struct device *dev,
						 struct mutex *lock)
{
	/*
	 * When CONFIG_DEBUG_MUTEXES is off, mutex_destroy() is just a nop, so
	 * there is no real need to register it in the devm subsystem.
	 */
	return 0;
}

#endif

/*
 * Evaluate @mutex once, run mutex_init() on the result and yield that same
 * pointer, so the initialization can be used as a function argument.
 */
#define __mutex_init_ret(mutex)			\
({						\
	typeof(mutex) mutex_ = (mutex);		\
						\
	mutex_init(mutex_);			\
	mutex_;					\
})

/* Initialize @mutex and hand it to __devm_mutex_init() for @dev. */
#define devm_mutex_init(dev, mutex) \
	__devm_mutex_init(dev, __mutex_init_ret(mutex))

/*
 * See kernel/locking/mutex.c for detailed documentation of these APIs.
 * Also see Documentation/locking/mutex-design.rst.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
					unsigned int subclass);
extern int __must_check _mutex_lock_killable(struct mutex *lock,
		unsigned int subclass, struct lockdep_map *nest_lock);
extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass);

/* With lock debugging, the plain lock calls map onto the nested variants with subclass 0. */
#define mutex_lock(lock) mutex_lock_nested(lock, 0)
#define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
#define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL)
#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)

/* typecheck() makes sure @nest_lock has a ->dep_map of type struct lockdep_map. */
#define mutex_lock_nest_lock(lock, nest_lock)				\
do {									\
	typecheck(struct lockdep_map *, &(nest_lock)->dep_map);		\
	_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);		\
} while (0)

#define mutex_lock_killable_nest_lock(lock, nest_lock)			\
(									\
	typecheck(struct lockdep_map *, &(nest_lock)->dep_map),		\
	_mutex_lock_killable(lock, 0, &(nest_lock)->dep_map)		\
)

#define mutex_lock_killable_nested(lock, subclass)			\
	_mutex_lock_killable(lock, subclass, NULL)

#else
extern void mutex_lock(struct mutex *lock);
extern int __must_check mutex_lock_interruptible(struct mutex *lock);
extern int __must_check mutex_lock_killable(struct mutex *lock);
extern void mutex_lock_io(struct mutex *lock);

/* Without lock debugging, the nested variants drop their extra argument. */
# define mutex_lock_nested(lock, subclass) mutex_lock(lock)
# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
# define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
# define mutex_lock_killable_nest_lock(lock, nest_lock) mutex_lock_killable(lock)
# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
#endif

/*
 * NOTE: mutex_trylock() follows the spin_trylock() convention,
 *       not the down_trylock() convention!
 *
 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
 */

#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);

#define mutex_trylock_nest_lock(lock, nest_lock)		\
(								\
	typecheck(struct lockdep_map *, &(nest_lock)->dep_map),	\
	_mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map)	\
)

#define mutex_trylock(lock) _mutex_trylock_nest_lock(lock, NULL)
#else
extern int mutex_trylock(struct mutex *lock);
#define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock)
#endif

extern void mutex_unlock(struct mutex *lock);

extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);

/*
 * Guard definitions: the unconditional guard pairs mutex_lock() with
 * mutex_unlock(); the conditional ones use mutex_trylock() and
 * mutex_lock_interruptible() (the latter with the condition _RET == 0).
 */
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T), _RET == 0)

extern unsigned long mutex_get_owner(struct mutex *lock);

#endif /* __LINUX_MUTEX_H */
2 2 3 6 3 2 2 2 2 2 3 3 3 3 3 3 1 2 3 3 3 3 1 2 2 2 2 1 1 2 2 1 1 1 1 1 1 1 47 1 46 1 46 58 58 3 3 3 3 3 3 3 1 1 1 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 
1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 
1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 
2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 // SPDX-License-Identifier: GPL-2.0 /* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
/* Crypto API headers followed by the TIPC-local headers. */
#include <crypto/aead.h>
#include <crypto/aes.h>
#include <crypto/rng.h>

#include "crypto.h"
#include "msg.h"
#include "bcast.h"

/* Timing limits, expressed in jiffies via msecs_to_jiffies(). */
#define TIPC_TX_GRACE_PERIOD	msecs_to_jiffies(5000) /* 5s */
#define TIPC_TX_LASTING_TIME	msecs_to_jiffies(10000) /* 10s */
#define TIPC_RX_ACTIVE_LIM	msecs_to_jiffies(3000) /* 3s */
#define TIPC_RX_PASSIVE_LIM	msecs_to_jiffies(15000) /* 15s */

/* Default and upper limit for the number of TFMs per key. */
#define TIPC_MAX_TFMS_DEF	10
#define TIPC_MAX_TFMS_LIM	1000

/* Rekeying interval default, in minutes (60 * 24). */
#define TIPC_REKEYING_INTV_DEF	(60 * 24) /* default: 1 day */

/*
 * TIPC Key ids
 *
 * KEY_MIN aliases KEY_MASTER (0) and KEY_MAX aliases KEY_3 (3), so the valid
 * ids are 0..3.
 */
enum {
	KEY_MASTER = 0,
	KEY_MIN = KEY_MASTER,
	KEY_1 = 1,
	KEY_2,
	KEY_3,
	KEY_MAX = KEY_3,
};

/*
 * TIPC Crypto statistics
 *
 * STAT_BADKEYS and STAT_BADMSGS deliberately share one value, so MAX_STATS
 * counts eight distinct slots.
 */
enum {
	STAT_OK,
	STAT_NOK,
	STAT_ASYNC,
	STAT_ASYNC_OK,
	STAT_ASYNC_NOK,
	STAT_BADKEYS, /* tx only */
	STAT_BADMSGS = STAT_BADKEYS, /* rx only */
	STAT_NOKEYS,
	STAT_SWITCHES,

	MAX_STATS,
};

/*
 * TIPC crypto statistics' header
 *
 * One label per statistics slot, in enum order. The slot shared by
 * STAT_BADKEYS and STAT_BADMSGS is labelled "badmsgs".
 */
static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok",
					"async_nok", "badmsgs", "nokeys",
					"switches"};

/* Max TFMs number per key */
int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF;
/* Key exchange switch, default: on */
int sysctl_tipc_key_exchange_enabled __read_mostly = 1;

/*
 * struct tipc_key - TIPC keys' status indicator
 *
 *         7     6     5     4     3     2     1     0
 *      +-----+-----+-----+-----+-----+-----+-----+-----+
 * key: | (reserved)|passive idx| active idx|pending idx|
 *      +-----+-----+-----+-----+-----+-----+-----+-----+
 *
 * The four 2-bit fields overlay the single byte @keys. The declaration order
 * is flipped between the little- and big-endian bitfield variants.
 */
struct tipc_key {
#define KEY_BITS (2)
#define KEY_MASK ((1 << KEY_BITS) - 1)
	union {
		struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			u8 pending:2,
			   active:2,
			   passive:2, /* rx only */
			   reserved:2;
#elif defined(__BIG_ENDIAN_BITFIELD)
			u8 reserved:2,
			   passive:2, /* rx only */
			   active:2,
			   pending:2;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
		} __packed;
		u8 keys;
	};
};

/**
 * struct tipc_tfm - TIPC TFM structure to form a list of TFMs
 * @tfm: cipher handle/key
 * @list: linked list of TFMs
 */
struct tipc_tfm {
	struct crypto_aead *tfm;
	struct list_head list;
};
/**
 * struct tipc_aead - TIPC AEAD key structure
 * @tfm_entry: per-cpu pointer to one entry in TFM list
 * @crypto: TIPC crypto owns this key
 * @cloned: reference to the source key in case cloning
 * @users: the number of the key users (TX/RX)
 * @salt: the key's SALT value
 * @authsize: authentication tag size (max = 16)
 * @mode: crypto mode is applied to the key
 * @hint: a hint for user key
 * @rcu: struct rcu_head
 * @key: the aead key
 * @gen: the key's generation
 * @seqno: the key seqno (cluster scope)
 * @refcnt: the key reference counter
 */
struct tipc_aead {
#define TIPC_AEAD_HINT_LEN (5)
	struct tipc_tfm * __percpu *tfm_entry;
	struct tipc_crypto *crypto;
	struct tipc_aead *cloned;
	atomic_t users;
	u32 salt;
	u8 authsize;
	u8 mode;
	/* Sized for TIPC_AEAD_HINT_LEN bytes as hex digits plus a terminating NUL. */
	char hint[2 * TIPC_AEAD_HINT_LEN + 1];
	struct rcu_head rcu;
	struct tipc_aead_key *key;
	u16 gen;

	/* Both counters carry their own cacheline alignment attribute. */
	atomic64_t seqno ____cacheline_aligned;
	refcount_t refcnt ____cacheline_aligned;
} ____cacheline_aligned;

/**
 * struct tipc_crypto_stats - TIPC Crypto statistics
 * @stat: array of crypto statistics
 */
struct tipc_crypto_stats {
	unsigned int stat[MAX_STATS];
};
/**
 * struct tipc_crypto - TIPC TX/RX crypto structure
 * @net: struct net
 * @node: TIPC node (RX)
 * @aead: array of pointers to AEAD keys for encryption/decryption
 * @peer_rx_active: replicated peer RX active key index
 * @key_gen: TX/RX key generation
 * @key: the key states
 * @skey_mode: session key's mode
 * @skey: received session key
 * @wq: common workqueue on TX crypto
 * @work: delayed work sched for TX/RX
 * @key_distr: key distributing state
 * @rekeying_intv: rekeying interval (in minutes)
 * @stats: the crypto statistics
 * @name: the crypto name
 * @sndnxt: the per-peer sndnxt (TX)
 * @timer1: general timer 1 (jiffies)
 * @timer2: general timer 2 (jiffies)
 * @working: the crypto is working or not
 * @key_master: flag indicates if master key exists
 * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.)
 * @nokey: no key indication
 * @flags: combined flags field
 * @lock: tipc_key lock
 */
struct tipc_crypto {
	struct net *net;
	struct tipc_node *node;
	/* One slot per key id, KEY_MASTER (0) through KEY_MAX. */
	struct tipc_aead __rcu *aead[KEY_MAX + 1];
	atomic_t peer_rx_active;
	u16 key_gen;
	struct tipc_key key;
	u8 skey_mode;
	struct tipc_aead_key *skey;
	struct workqueue_struct *wq;
	struct delayed_work work;
#define KEY_DISTR_SCHED 1
#define KEY_DISTR_COMPL 2
	atomic_t key_distr;
	u32 rekeying_intv;

	struct tipc_crypto_stats __percpu *stats;
	char name[48];

	atomic64_t sndnxt ____cacheline_aligned;
	unsigned long timer1;
	unsigned long timer2;
	/* The single-bit flags overlay the byte @flags. */
	union {
		struct {
			u8 working:1;
			u8 key_master:1;
			u8 legacy_user:1;
			u8 nokey: 1;
		};
		u8 flags;
	};
	spinlock_t lock; /* crypto lock */

} ____cacheline_aligned;

/* struct tipc_crypto_tx_ctx - TX context for callbacks */
struct tipc_crypto_tx_ctx {
	struct tipc_aead *aead;
	struct tipc_bearer *bearer;
	struct tipc_media_addr dst;
};

/* struct tipc_crypto_rx_ctx - RX context for callbacks */
struct tipc_crypto_rx_ctx {
	struct tipc_aead *aead;
	struct tipc_bearer *bearer;
};

/* Forward declarations of the file-local helpers. */
static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead);
static inline void tipc_aead_put(struct tipc_aead *aead);
static void tipc_aead_free(struct rcu_head *rp);
static int tipc_aead_users(struct tipc_aead __rcu *aead);
static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim);
static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim);
static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val);
static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead);
static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
			  u8 mode);
static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src);
static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
				 unsigned int crypto_ctx_size,
				 u8 **iv, struct aead_request **req,
				 struct scatterlist **sg, int nsg);
static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
			     struct tipc_bearer *b,
			     struct tipc_media_addr *dst,
			     struct tipc_node *__dnode);
static void tipc_aead_encrypt_done(void *data, int err);
static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
			     struct sk_buff *skb, struct tipc_bearer *b);
static void tipc_aead_decrypt_done(void *data, int err);
static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr);
static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
			   u8 tx_key, struct sk_buff *skb,
			   struct tipc_crypto *__rx);
static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
					     u8 new_passive,
					     u8 new_active,
					     u8 new_pending);
static int tipc_crypto_key_attach(struct tipc_crypto *c,
				  struct tipc_aead *aead, u8 pos,
				  bool master_key);
static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending);
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
						 struct tipc_crypto *rx,
						 struct sk_buff *skb,
						 u8 tx_key);
static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb);
static int tipc_crypto_key_revoke(struct net *net, u8 tx_key);
static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb,
					 struct tipc_bearer *b,
					 struct tipc_media_addr *dst,
					 struct tipc_node *__dnode, u8 type);
static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
				     struct tipc_bearer *b,
				     struct sk_buff **skb, int err);
static void tipc_crypto_do_cmd(struct net *net, int cmd);
static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf);
static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
				  char *buf);
static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey,
				u16 gen, u8 mode, u32 dnode);
static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr);
static void tipc_crypto_work_tx(struct work_struct *work);
static void tipc_crypto_work_rx(struct work_struct *work);
static int tipc_aead_key_generate(struct tipc_aead_key *skey);

/* A crypto with no node attached is treated as TX; anything else is RX. */
#define is_tx(crypto) (!(crypto)->node)
#define is_rx(crypto) (!is_tx(crypto))
(!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ } while (0) #define tipc_crypto_key_detach(rcu_ptr, lock) \ tipc_aead_rcu_replace((rcu_ptr), NULL, lock) /** * tipc_aead_key_validate - Validate a AEAD user key * @ukey: pointer to user key data * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ if (strcmp(ukey->alg_name, "gcm(aes)")) { GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && keylen != TIPC_AES_GCM_KEY_SIZE_256)) { GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); return -EKEYREJECTED; } return 0; } /** * tipc_aead_key_generate - Generate new session key * @skey: input/output key with new content * * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_key_generate(struct tipc_aead_key *skey) { int rc = 0; /* Fill the key's content with a random value via RNG cipher */ rc = crypto_get_default_rng(); if (likely(!rc)) { rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, skey->keylen); crypto_put_default_rng(); } return rc; } static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (unlikely(!tmp || 
!refcount_inc_not_zero(&tmp->refcnt))) tmp = NULL; rcu_read_unlock(); return tmp; } static inline void tipc_aead_put(struct tipc_aead *aead) { if (aead && refcount_dec_and_test(&aead->refcnt)) call_rcu(&aead->rcu, tipc_aead_free); } /** * tipc_aead_free - Release AEAD key incl. all the TFMs in the list * @rp: rcu head pointer */ static void tipc_aead_free(struct rcu_head *rp) { struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); struct tipc_tfm *tfm_entry, *head, *tmp; if (aead->cloned) { tipc_aead_put(aead->cloned); } else { head = *get_cpu_ptr(aead->tfm_entry); put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); kfree(tfm_entry); } /* Free the head */ crypto_free_aead(head->tfm); list_del(&head->list); kfree(head); } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); kfree_sensitive(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; int users = 0; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) users = atomic_read(&tmp->users); rcu_read_unlock(); return users; } static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&tmp->users, 1, lim); rcu_read_unlock(); } static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); rcu_read_unlock(); } static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) { struct tipc_aead *tmp; int cur; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) { do { cur = atomic_read(&tmp->users); if (cur == val) break; } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); } rcu_read_unlock(); } /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it * @aead: the AEAD key 
pointer
 */
static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead)
{
	struct tipc_tfm **tfm_entry;
	struct crypto_aead *tfm;

	/* Advance this CPU's cursor and hand out the TFM it now points at */
	tfm_entry = get_cpu_ptr(aead->tfm_entry);
	*tfm_entry = list_next_entry(*tfm_entry, list);
	tfm = (*tfm_entry)->tfm;
	put_cpu_ptr(tfm_entry);

	return tfm;
}

/**
 * tipc_aead_init - Initiate TIPC AEAD
 * @aead: returned new TIPC AEAD key handle pointer
 * @ukey: pointer to user key data
 * @mode: the key mode
 *
 * Allocate a (list of) new cipher transformation (TFM) with the specific user
 * key data if valid. The number of the allocated TFMs can be set via the sysfs
 * "net/tipc/max_tfms" first.
 * Also, all the other AEAD data are also initialized.
 *
 * If a TFM cannot be set up after at least one has been created, the key is
 * still initiated with the TFMs obtained so far.
 *
 * Return: 0 if the initiation is successful, otherwise: < 0
 */
static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
			  u8 mode)
{
	struct tipc_tfm *tfm_entry, *head = NULL;
	struct crypto_aead *tfm;
	struct tipc_aead *tmp;
	int keylen, err, cpu;
	int tfm_cnt = 0;

	if (unlikely(*aead))
		return -EEXIST;

	/* Allocate a new AEAD */
	tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
	if (unlikely(!tmp))
		return -ENOMEM;

	/* The key consists of two parts: [AES-KEY][SALT] */
	keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE;

	/* Allocate per-cpu TFM entry pointer */
	tmp->tfm_entry = alloc_percpu(struct tipc_tfm *);
	if (!tmp->tfm_entry) {
		kfree_sensitive(tmp);
		return -ENOMEM;
	}

	/* Make a list of TFMs with the user key data */
	do {
		tfm = crypto_alloc_aead(ukey->alg_name, 0, 0);
		if (IS_ERR(tfm)) {
			err = PTR_ERR(tfm);
			break;
		}

		if (unlikely(!tfm_cnt &&
			     crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) {
			crypto_free_aead(tfm);
			err = -ENOTSUPP;
			break;
		}

		/* Check each step on its own: OR-ing two negative errnos
		 * together would yield a meaningless error code.
		 */
		err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE);
		if (likely(!err))
			err = crypto_aead_setkey(tfm, ukey->key, keylen);
		if (unlikely(err)) {
			crypto_free_aead(tfm);
			break;
		}

		tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL);
		if (unlikely(!tfm_entry)) {
			crypto_free_aead(tfm);
			err = -ENOMEM;
			break;
		}
		INIT_LIST_HEAD(&tfm_entry->list);
		tfm_entry->tfm = tfm;

		/* First entry? */
		if (!tfm_cnt) {
			head = tfm_entry;
			for_each_possible_cpu(cpu) {
				*per_cpu_ptr(tmp->tfm_entry, cpu) = head;
			}
		} else {
			list_add_tail(&tfm_entry->list, &head->list);
		}

	} while (++tfm_cnt < sysctl_tipc_max_tfms);

	/* Not any TFM is allocated? */
	if (!tfm_cnt) {
		free_percpu(tmp->tfm_entry);
		kfree_sensitive(tmp);
		return err;
	}

	/* Form a hex string of some last bytes as the key's hint */
	bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN,
		TIPC_AEAD_HINT_LEN);

	/* Initialize the other data */
	tmp->mode = mode;
	tmp->cloned = NULL;
	tmp->authsize = TIPC_AES_GCM_TAG_SIZE;
	tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL);
	if (!tmp->key) {
		/* Releases the TFM list and the per-cpu pointers as well */
		tipc_aead_free(&tmp->rcu);
		return -ENOMEM;
	}
	memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE);
	atomic_set(&tmp->users, 0);
	atomic64_set(&tmp->seqno, 0);
	refcount_set(&tmp->refcnt, 1);

	*aead = tmp;
	return 0;
}

/**
 * tipc_aead_clone - Clone a TIPC AEAD key
 * @dst: dest key for the cloning
 * @src: source key to clone from
 *
 * Make a "copy" of the source AEAD key data to the dest, the TFMs list is
 * common for the keys.
 * A reference to the source is held in the "cloned" pointer for the later
 * freeing purposes.
 *
 * Note: this must be done in cluster-key mode only!
* Return: 0 in case of success, otherwise < 0
 */
static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src)
{
	struct tipc_aead *clone;
	int cpu;

	if (!src)
		return -ENOKEY;

	if (src->mode != CLUSTER_KEY)
		return -EINVAL;

	if (unlikely(*dst))
		return -EEXIST;

	clone = kzalloc(sizeof(*clone), GFP_ATOMIC);
	if (unlikely(!clone))
		return -ENOMEM;

	clone->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC);
	if (unlikely(!clone->tfm_entry)) {
		kfree_sensitive(clone);
		return -ENOMEM;
	}

	/* Every CPU slot starts at the entry the source slot points at */
	for_each_possible_cpu(cpu) {
		*per_cpu_ptr(clone->tfm_entry, cpu) =
				*per_cpu_ptr(src->tfm_entry, cpu);
	}

	memcpy(clone->hint, src->hint, sizeof(src->hint));
	clone->mode = src->mode;
	clone->salt = src->salt;
	clone->authsize = src->authsize;
	atomic_set(&clone->users, 0);
	atomic64_set(&clone->seqno, 0);
	refcount_set(&clone->refcnt, 1);

	/* Take a reference on the source and remember it in the clone */
	WARN_ON(!refcount_inc_not_zero(&src->refcnt));
	clone->cloned = src;

	*dst = clone;
	return 0;
}

/**
 * tipc_aead_mem_alloc - Allocate memory for AEAD request operations
 * @tfm: cipher handle to be registered with the request
 * @crypto_ctx_size: size of crypto context for callback
 * @iv: returned pointer to IV data
 * @req: returned pointer to AEAD request data
 * @sg: returned pointer to SG lists
 * @nsg: number of SG lists to be allocated
 *
 * Allocate memory to store the crypto context data, AEAD request, IV and SG
 * lists, the memory layout is as follows:
 * crypto_ctx || iv || aead_req || sg[]
 *
 * Return: the pointer to the memory areas in case of success, otherwise NULL
 */
static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
				 unsigned int crypto_ctx_size,
				 u8 **iv, struct aead_request **req,
				 struct scatterlist **sg, int nsg)
{
	unsigned int iv_size, req_size;
	unsigned int len;
	u8 *mem;

	iv_size = crypto_aead_ivsize(tfm);
	req_size = sizeof(**req) + crypto_aead_reqsize(tfm);

	/* Sum up the areas, padding for the alignment of each one */
	len = crypto_ctx_size;
	len += iv_size;
	len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
	len = ALIGN(len, crypto_tfm_ctx_alignment());
	len += req_size;
	len = ALIGN(len,
__alignof__(struct scatterlist)); len += nsg * sizeof(**sg); mem = kmalloc(len, GFP_ATOMIC); if (!mem) return NULL; *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, __alignof__(struct scatterlist)); return (void *)mem; } /** * tipc_aead_encrypt - Encrypt a message * @aead: TIPC AEAD key for the message encryption * @skb: the input/output skb * @b: TIPC bearer where the message will be delivered after the encryption * @dst: the destination media address * @__dnode: TIPC dest node if "known" * * Return: * * 0 : if the encryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct crypto_aead *tfm = tipc_aead_tfm_next(aead); struct tipc_crypto_tx_ctx *tx_ctx; struct aead_request *req; struct sk_buff *trailer; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, len, tailen, nsg, rc; void *ctx; u32 salt; u8 *iv; /* Make sure message len at least 4-byte aligned */ len = ALIGN(skb->len, 4); tailen = len - skb->len + aead->authsize; /* Expand skb tail for authentication tag: * As for simplicity, we'd have made sure skb having enough tailroom * for authentication tag @skb allocation. Even when skb is nonlinear * but there is no frag_list, it should be still fine! * Otherwise, we must cow it to be a writable buffer with the tailroom. 
*/ SKB_LINEAR_ASSERT(skb); if (tailen > skb_tailroom(skb)) { pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n", skb_tailroom(skb), tailen); } nsg = skb_cow_data(skb, tailen, &trailer); if (unlikely(nsg < 0)) { pr_err("TX: skb_cow_data() returned %d\n", nsg); return nsg; } pskb_put(skb, trailer, tailen); /* Allocate memory for the AEAD operation */ ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); goto exit; } /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] * In case we're in cluster-key mode, SALT is varied by xor-ing with * the source address (or w0 of id), otherwise with the dest address * if dest is known. */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (__dnode) salt ^= tipc_node_get_addr(__dnode); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_encrypt_done, skb); tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; tx_ctx->aead = aead; tx_ctx->bearer = b; memcpy(&tx_ctx->dst, dst, sizeof(*dst)); /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Get net to avoid freed tipc_crypto when delete namespace */ if (!maybe_get_net(aead->crypto->net)) { tipc_bearer_put(b); rc = -ENODEV; goto exit; } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); put_net(aead->crypto->net); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx 
= NULL; return rc; } static void tipc_aead_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; struct tipc_crypto *tx = aead->crypto; struct net *net = tx->net; switch (err) { case 0: this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); rcu_read_lock(); if (likely(test_bit(0, &b->up))) b->media->send_msg(net, skb, b, &tx_ctx->dst); else kfree_skb(skb); rcu_read_unlock(); break; case -EINPROGRESS: return; default: this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); kfree_skb(skb); break; } kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); put_net(net); } /** * tipc_aead_decrypt - Decrypt an encrypted message * @net: struct net * @aead: TIPC AEAD for the message decryption * @skb: the input/output skb * @b: TIPC bearer where the message has been received * * Return: * * 0 : if the decryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_crypto_rx_ctx *rx_ctx; struct aead_request *req; struct crypto_aead *tfm; struct sk_buff *unused; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, nsg, rc; void *ctx; u32 salt; u8 *iv; if (unlikely(!aead)) return -ENOKEY; nsg = skb_cow_data(skb, 0, &unused); if (unlikely(nsg < 0)) { pr_err("RX: skb_cow_data() returned %d\n", nsg); return nsg; } /* Allocate memory for the AEAD operation */ tfm = tipc_aead_tfm_next(aead); ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); goto exit; } /* Reconstruct IV: */ ehdr = (struct 
tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (ehdr->destined) salt ^= tipc_own_addr(net); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_decrypt_done, skb); rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; rx_ctx->aead = aead; rx_ctx->bearer = b; /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do decrypt */ rc = crypto_aead_decrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; struct net *net = aead->crypto->net; switch (err) { case 0: this_cpu_inc(stats->stat[STAT_ASYNC_OK]); break; case -EINPROGRESS: return; default: this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); break; } kfree(rx_ctx); tipc_crypto_rcv_complete(net, aead, b, &skb, err); if (likely(skb)) { if (likely(test_bit(0, &b->up))) tipc_rcv(net, skb, b); else kfree_skb(skb); } tipc_bearer_put(b); } static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) { return (ehdr->user != LINK_CONFIG) ? 
EHDR_SIZE : EHDR_CFG_SIZE; } /** * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * * Return: "true" if this is a valid encryption message, otherwise "false" */ bool tipc_ehdr_validate(struct sk_buff *skb) { struct tipc_ehdr *ehdr; int ehsz; if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) return false; ehdr = (struct tipc_ehdr *)skb->data; if (unlikely(ehdr->version != TIPC_EVERSION)) return false; ehsz = tipc_ehdr_size(ehdr); if (unlikely(!pskb_may_pull(skb, ehsz))) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; return true; } /** * tipc_ehdr_build - Build TIPC encryption message header * @net: struct net * @aead: TX AEAD key to be used for the message encryption * @tx_key: key id used for the message encryption * @skb: input/output message skb * @__rx: RX crypto handle if dest is "known" * * Return: the header size if the building is successful, otherwise < 0 */ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_ehdr *ehdr; u32 user = msg_user(hdr); u64 seqno; int ehsz; /* Make room for encryption header */ ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; WARN_ON(skb_headroom(skb) < ehsz); ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); /* Obtain a seqno first: * Use the key seqno (= cluster wise) if dest is unknown or we're in * cluster key mode, otherwise it's better for a per-peer seqno! */ if (!__rx || aead->mode == CLUSTER_KEY) seqno = atomic64_inc_return(&aead->seqno); else seqno = atomic64_inc_return(&__rx->sndnxt); /* Revoke the key if seqno is wrapped around */ if (unlikely(!seqno)) return tipc_crypto_key_revoke(net, tx_key); /* Word 1-2 */ ehdr->seqno = cpu_to_be64(seqno); /* Words 0, 3- */ ehdr->version = TIPC_EVERSION; ehdr->user = 0; ehdr->keepalive = 0; ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? 
__rx->key.active : 0; ehdr->rx_nokey = (__rx) ? __rx->nokey : 0; ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; switch (user) { case LINK_CONFIG: ehdr->user = LINK_CONFIG; memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); break; default: if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { ehdr->user = LINK_PROTOCOL; ehdr->keepalive = msg_is_keepalive(hdr); } ehdr->addr = hdr->hdr[3]; break; } return ehsz; } static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending) { struct tipc_key old = c->key; char buf[32]; c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | ((new_active & KEY_MASK) << (KEY_BITS)) | ((new_pending & KEY_MASK)); pr_debug("%s: key changing %s ::%pS\n", c->name, tipc_key_change_dump(old, c->key, buf), __builtin_return_address(0)); } /** * tipc_crypto_key_init - Initiate a new user / AEAD key * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. * * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; /* Initiate with the new user key */ rc = tipc_aead_init(&aead, ukey, mode); /* Attach it to the crypto */ if (likely(!rc)) { rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } return rc; } /** * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! 
* @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key) { struct tipc_key key; int rc = -EBUSY; u8 new_key; spin_lock_bh(&c->lock); key = c->key; if (master_key) { new_key = KEY_MASTER; goto attach; } if (key.active && key.passive) goto exit; if (key.pending) { if (tipc_aead_users(c->aead[key.pending]) > 0) goto exit; /* if (pos): ok with replacing, will be aligned when needed */ /* Replace it */ new_key = key.pending; } else { if (pos) { if (key.active && pos != key_next(key.active)) { key.passive = pos; new_key = pos; goto attach; } else if (!key.active && !key.passive) { key.pending = pos; new_key = pos; goto attach; } } key.pending = key_next(key.active ?: key.passive); new_key = key.pending; } attach: aead->crypto = c; aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen; tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); if (likely(c->key.keys != key.keys)) tipc_crypto_key_set_state(c, key.passive, key.active, key.pending); c->working = 1; c->nokey = 0; c->key_master |= master_key; rc = new_key; exit: spin_unlock_bh(&c->lock); return rc; } void tipc_crypto_key_flush(struct tipc_crypto *c) { struct tipc_crypto *tx, *rx; int k; spin_lock_bh(&c->lock); if (is_rx(c)) { /* Try to cancel pending work */ rx = c; tx = tipc_net(rx->net)->crypto_tx; if (cancel_delayed_work(&rx->work)) { kfree(rx->skey); rx->skey = NULL; atomic_xchg(&rx->key_distr, 0); tipc_node_put(rx->node); } /* RX stopping => decrease TX key users if any */ k = atomic_xchg(&rx->peer_rx_active, 0); if (k) { tipc_aead_users_dec(tx->aead[k], 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; } } c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); atomic64_set(&c->sndnxt, 0); spin_unlock_bh(&c->lock); } /** * tipc_crypto_key_try_align - 
Align RX keys if possible
 * @rx: RX crypto handle
 * @new_pending: new pending slot if aligned (= TX key from peer)
 *
 * Peer has used an unknown key slot, this only happens when peer has left and
 * rejoined, or we are newcomer.
 * That means, there must be no active key but a pending key at unaligned slot.
 * If so, we try to move the pending key to the new slot.
 * Note: A potential passive key can exist, it will be shifted correspondingly!
 *
 * Return: "true" if key is successfully aligned, otherwise "false"
 */
static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending)
{
	struct tipc_aead *tmp1, *tmp2 = NULL;
	struct tipc_key key;
	bool aligned = false;
	u8 new_passive = 0;
	int x;

	spin_lock(&rx->lock);
	key = rx->key;

	/* Already in place? */
	if (key.pending == new_pending) {
		aligned = true;
		goto exit;
	}

	/* Only an unused pending key without any active key can be moved */
	if (key.active || !key.pending ||
	    tipc_aead_users(rx->aead[key.pending]) > 0)
		goto exit;

	/* Try to "isolate" this pending key first */
	tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock);
	if (!refcount_dec_if_one(&tmp1->refcnt))
		goto exit;
	rcu_assign_pointer(rx->aead[key.pending], NULL);

	/* Move passive key if any */
	if (key.passive) {
		tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2,
					   lockdep_is_held(&rx->lock));
		/* Keep the distance between the passive and pending slots */
		x = (key.passive - key.pending + new_pending) % KEY_MAX;
		new_passive = (x <= 0) ? x + KEY_MAX : x;
	}

	/* Re-allocate the key(s) */
	tipc_crypto_key_set_state(rx, new_passive, 0, new_pending);
	rcu_assign_pointer(rx->aead[new_pending], tmp1);
	if (new_passive)
		rcu_assign_pointer(rx->aead[new_passive], tmp2);
	refcount_set(&tmp1->refcnt, 1);
	aligned = true;
	pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending,
			    new_pending);

exit:
	spin_unlock(&rx->lock);
	return aligned;
}

/**
 * tipc_crypto_key_pick_tx - Pick one TX key for message decryption
 * @tx: TX crypto handle
 * @rx: RX crypto handle (can be NULL)
 * @skb: the message skb which will be decrypted later
 * @tx_key: peer TX key id
 *
 * This function looks up the existing TX keys and pick one which is suitable
 * for the message decryption, that must be a cluster key and not used before
 * on the same message (i.e. recursive).
 *
 * Return: the TX AEAD key handle in case of success, otherwise NULL
 */
static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
						 struct tipc_crypto *rx,
						 struct sk_buff *skb,
						 u8 tx_key)
{
	struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb);
	struct tipc_aead *aead = NULL;
	struct tipc_key key = tx->key;
	u8 k, i = 0;

	/* Initialize data if not yet */
	if (!skb_cb->tx_clone_deferred) {
		skb_cb->tx_clone_deferred = 1;
		memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx));
	}

	skb_cb->tx_clone_ctx.rx = rx;
	if (++skb_cb->tx_clone_ctx.recurs > 2)
		return NULL;

	/* Pick one TX key */
	spin_lock(&tx->lock);
	if (tx_key == KEY_MASTER) {
		aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock);
		goto done;
	}
	do {
		k = (i == 0) ? key.pending :
			((i == 1) ?
key.active : key.passive); if (!k) continue; aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); if (!aead) continue; if (aead->mode != CLUSTER_KEY || aead == skb_cb->tx_clone_ctx.last) { aead = NULL; continue; } /* Ok, found one cluster key */ skb_cb->tx_clone_ctx.last = aead; WARN_ON(skb->next); skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); break; } while (++i < 3); done: if (likely(aead)) WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; } /** * tipc_crypto_key_synch: Synch own key data according to peer key status * @rx: RX crypto handle * @skb: TIPCv2 message buffer (incl. the ehdr from peer) * * This function updates the peer node related data as the peer RX active key * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * * It also considers if peer has no key, then we need to make own master key * (if any) taking over i.e. starting grace period and also trigger key * distributing process. * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) { struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_msg *hdr = buf_msg(skb); u32 self = tipc_own_addr(rx->net); u8 cur, new; unsigned long delay; /* Update RX 'key_master' flag according to peer, also mark "legacy" if * a peer has no master key. 
*/ rx->key_master = ehdr->master_key; if (!rx->key_master) tx->legacy_user = 1; /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; /* Case 1: Peer has no keys, let's make master key take over */ if (ehdr->rx_nokey) { /* Set or extend grace period */ tx->timer2 = jiffies; /* Schedule key distributing for the peer if not yet */ if (tx->key.keys && !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) { get_random_bytes(&delay, 2); delay %= 5; delay = msecs_to_jiffies(500 * ++delay); if (queue_delayed_work(tx->wq, &rx->work, delay)) tipc_node_get(rx->node); } } else { /* Cancel a pending key distributing if any */ atomic_xchg(&rx->key_distr, 0); } /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && cur != new && atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) { if (new) tipc_aead_users_inc(tx->aead[new], INT_MAX); if (cur) tipc_aead_users_dec(tx->aead[cur], 0); atomic64_set(&rx->sndnxt, 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; pr_debug("%s: key users changed %d-- %d++, peer %s\n", tx->name, cur, new, rx->name); } } static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; } int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, struct tipc_node *node) { struct tipc_crypto *c; if (*crypto) return -EEXIST; /* Allocate crypto */ c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return -ENOMEM; /* Allocate workqueue on TX */ if (!node) { c->wq 
= alloc_ordered_workqueue("tipc_crypto", 0); if (!c->wq) { kfree(c); return -ENOMEM; } } /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { if (c->wq) destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } c->flags = 0; c->net = net; c->node = node; get_random_bytes(&c->key_gen, 2); tipc_crypto_key_set_state(c, 0, 0, 0); atomic_set(&c->key_distr, 0); atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : tipc_own_id_string(c->net)); if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); else INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; } void tipc_crypto_stop(struct tipc_crypto **crypto) { struct tipc_crypto *c = *crypto; u8 k; if (!c) return; /* Flush any queued works & destroy wq */ if (is_tx(c)) { c->rekeying_intv = 0; cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); } /* Release AEAD keys */ rcu_read_lock(); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_aead_put(rcu_dereference(c->aead[k])); rcu_read_unlock(); pr_debug("%s: has been stopped\n", c->name); /* Free this crypto statistics */ free_percpu(c->stats); *crypto = NULL; kfree_sensitive(c); } void tipc_crypto_timeout(struct tipc_crypto *rx) { struct tipc_net *tn = tipc_net(rx->net); struct tipc_crypto *tx = tn->crypto_tx; struct tipc_key key; int cmd; /* TX pending: taking all users & stable -> active */ spin_lock(&tx->lock); key = tx->key; if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) goto s1; if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) goto s1; if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME)) goto s1; tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); if (key.active) 
tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
	this_cpu_inc(tx->stats->stat[STAT_SWITCHES]);
	pr_info("%s: key[%d] is activated\n", tx->name, key.pending);

s1:
	spin_unlock(&tx->lock);

	/* The RX steps below are mutually exclusive: the first one that
	 * applies changes the key state and jumps to the unlock at s5.
	 */

	/* RX pending: having user -> active */
	spin_lock(&rx->lock);
	key = rx->key;
	if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0)
		goto s2;

	/* The current active key (if any) is kept as the passive one */
	if (key.active)
		key.passive = key.active;
	key.active = key.pending;
	rx->timer2 = jiffies;
	tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
	this_cpu_inc(rx->stats->stat[STAT_SWITCHES]);
	pr_info("%s: key[%d] is activated\n", rx->name, key.pending);
	goto s5;

s2:
	/* RX pending: not working -> remove */
	if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10)
		goto s3;

	tipc_crypto_key_set_state(rx, key.passive, key.active, 0);
	tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock);
	pr_debug("%s: key[%d] is removed\n", rx->name, key.pending);
	goto s5;

s3:
	/* RX active: timed out or no user -> pending */
	if (!key.active)
		goto s4;
	if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) &&
	    tipc_aead_users(rx->aead[key.active]) > 0)
		goto s4;

	if (key.pending)
		key.passive = key.active;
	else
		key.pending = key.active;
	rx->timer2 = jiffies;
	tipc_crypto_key_set_state(rx, key.passive, 0, key.pending);
	tipc_aead_users_set(rx->aead[key.pending], 0);
	pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active);
	goto s5;

s4:
	/* RX passive: outdated or not working -> free */
	if (!key.passive)
		goto s5;
	if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) &&
	    tipc_aead_users(rx->aead[key.passive]) > -10)
		goto s5;

	tipc_crypto_key_set_state(rx, 0, key.active, key.pending);
	tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock);
	pr_debug("%s: key[%d] is freed\n", rx->name, key.passive);

s5:
	spin_unlock(&rx->lock);

	/* Relax it here, the flag will be set again if it really is, but only
	 * when we are not in grace period for safety!
*/ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) tx->legacy_user = 0; /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; cmd = sysctl_tipc_max_tfms; sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; tipc_crypto_do_cmd(rx->net, cmd); } static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type) { struct sk_buff *skb; skb = skb_clone(_skb, GFP_ATOMIC); if (skb) { TIPC_SKB_CB(skb)->xmit_type = type; tipc_crypto_xmit(net, &skb, b, dst, __dnode); if (skb) b->media->send_msg(net, skb, b, dst); } } /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net * @skb: input/output message skb pointer * @b: bearer used for xmit later * @dst: destination media address * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then * encrypt the original TIPC message by using the pending, master or active * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! 
*
 * Return:
 * * 0                   : the encryption has succeeded (or no encryption)
 * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made
 * * -ENOKEY             : the encryption has failed due to no key
 * * -EKEYREVOKED        : the encryption has failed due to key revoked
 * * -ENOMEM             : the encryption has failed due to no memory
 * * < 0                 : the encryption has failed due to other reasons
 */
int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
		     struct tipc_bearer *b, struct tipc_media_addr *dst,
		     struct tipc_node *__dnode)
{
	struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode);
	struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
	struct tipc_crypto_stats __percpu *stats = tx->stats;
	struct tipc_msg *hdr = buf_msg(*skb);
	struct tipc_key key = tx->key;
	struct tipc_aead *aead = NULL;
	u32 user = msg_user(hdr);
	u32 type = msg_type(hdr);
	int rc = -ENOKEY;
	u8 tx_key = 0;

	/* No encryption? */
	if (!tx->working)
		return 0;

	/* Pending key if peer has active on it or probing time */
	if (unlikely(key.pending)) {
		tx_key = key.pending;
		if (!tx->key_master && !key.active)
			goto encrypt;
		if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key)
			goto encrypt;
		if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) {
			pr_debug("%s: probing for key[%d]\n", tx->name,
				 key.pending);
			goto encrypt;
		}
		/* Send an extra copy marked as probing, the original message
		 * continues with the key selection below.
		 */
		if (user == LINK_CONFIG || user == LINK_PROTOCOL)
			tipc_crypto_clone_msg(net, *skb, b, dst, __dnode,
					      SKB_PROBING);
	}

	/* Master key if this is a *vital* message or in grace period */
	if (tx->key_master) {
		tx_key = KEY_MASTER;
		if (!key.active)
			goto encrypt;
		if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) {
			pr_debug("%s: gracing for msg (%d %d)\n", tx->name,
				 user, type);
			goto encrypt;
		}
		if (user == LINK_CONFIG ||
		    (user == LINK_PROTOCOL && type == RESET_MSG) ||
		    (user == MSG_CRYPTO && type == KEY_DISTR_MSG) ||
		    time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) {
			if (__rx && __rx->key_master &&
			    !atomic_read(&__rx->peer_rx_active))
				goto encrypt;
			if (!__rx) {
				if (likely(!tx->legacy_user))
					goto encrypt;
				/* Send an extra copy marked as gracing */
				tipc_crypto_clone_msg(net, *skb, b, dst,
						      __dnode, SKB_GRACING);
			}
		}
	}

	/* Else, use the active key if any */
	if (likely(key.active)) {
		tx_key = key.active;
		goto encrypt;
	}

	/* No usable key: rc is still -ENOKEY here */
	goto exit;

encrypt:
	aead = tipc_aead_get(tx->aead[tx_key]);
	if (unlikely(!aead))
		goto exit;
	rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx);
	if (likely(rc > 0))
		rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode);

exit:
	switch (rc) {
	case 0:
		this_cpu_inc(stats->stat[STAT_OK]);
		break;
	case -EINPROGRESS:
	case -EBUSY:
		/* The key reference is dropped by the completion callback */
		this_cpu_inc(stats->stat[STAT_ASYNC]);
		*skb = NULL;
		return rc;
	default:
		this_cpu_inc(stats->stat[STAT_NOK]);
		if (rc == -ENOKEY)
			this_cpu_inc(stats->stat[STAT_NOKEYS]);
		else if (rc == -EKEYREVOKED)
			this_cpu_inc(stats->stat[STAT_BADKEYS]);
		kfree_skb(*skb);
		*skb = NULL;
		break;
	}

	tipc_aead_put(aead);
	return rc;
}

/**
 * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer
 * @net: struct net
 * @rx: RX crypto handle
 * @skb: input/output message skb pointer
 * @b: bearer where the message has been received
 *
 * If the decryption is successful, the decrypted skb is returned directly or
 * as the callback, the encryption header and auth tag will be trimmed out
 * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete().
 * Otherwise, the skb will be freed!
 * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX
 * cluster key(s) can be taken for decryption (- recursive).
*
 * Return:
 * * 0                   : the decryption has successfully completed
 * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made
 * * -ENOKEY             : the decryption has failed due to no key
 * * -EBADMSG            : the decryption has failed due to bad message
 * * -ENOMEM             : the decryption has failed due to no memory
 * * < 0                 : the decryption has failed due to other reasons
 */
int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
		    struct sk_buff **skb, struct tipc_bearer *b)
{
	struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
	struct tipc_crypto_stats __percpu *stats;
	struct tipc_aead *aead = NULL;
	struct tipc_key key;
	int rc = -ENOKEY;
	u8 tx_key, n;

	tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;

	/* New peer?
	 * Let's try with TX key (i.e. cluster mode) & verify the skb first!
	 */
	if (unlikely(!rx || tx_key == KEY_MASTER))
		goto pick_tx;

	/* Pick RX key according to TX key if any */
	key = rx->key;
	if (tx_key == key.active || tx_key == key.pending ||
	    tx_key == key.passive)
		goto decrypt;

	/* Unknown key, let's try to align RX key(s) */
	if (tipc_crypto_key_try_align(rx, tx_key))
		goto decrypt;

pick_tx:
	/* No key suitable? Try to pick one from TX... */
	aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key);
	if (aead)
		goto decrypt;
	goto exit;

decrypt:
	rcu_read_lock();
	/* No TX key was picked above, so take the RX key in the peer's slot */
	if (!aead)
		aead = tipc_aead_get(rx->aead[tx_key]);
	rc = tipc_aead_decrypt(net, aead, *skb, b);
	rcu_read_unlock();

exit:
	/* Without an RX crypto the statistics are accounted on the TX one */
	stats = ((rx) ?: tx)->stats;
	switch (rc) {
	case 0:
		this_cpu_inc(stats->stat[STAT_OK]);
		break;
	case -EINPROGRESS:
	case -EBUSY:
		this_cpu_inc(stats->stat[STAT_ASYNC]);
		*skb = NULL;
		return rc;
	default:
		this_cpu_inc(stats->stat[STAT_NOK]);
		if (rc == -ENOKEY) {
			kfree_skb(*skb);
			*skb = NULL;
			if (rx) {
				/* Mark rx->nokey only if we don't have a
				 * pending received session key, nor a newer
				 * one i.e. in the next slot.
				 */
				n = key_next(tx_key);
				rx->nokey = !(rx->skey ||
					      rcu_access_pointer(rx->aead[n]));
				pr_debug_ratelimited("%s: nokey %d, key %d/%x\n",
						     rx->name, rx->nokey,
						     tx_key, rx->key.keys);
				tipc_node_put(rx->node);
			}
			this_cpu_inc(stats->stat[STAT_NOKEYS]);
			return rc;
		} else if (rc == -EBADMSG) {
			this_cpu_inc(stats->stat[STAT_BADMSGS]);
		}
		break;
	}

	tipc_crypto_rcv_complete(net, aead, b, skb, rc);
	return rc;
}

/* Finish the reception of a message once its decryption has completed with
 * status @err; on the failure paths *skb is freed and set to NULL.
 */
static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
				     struct tipc_bearer *b,
				     struct sk_buff **skb, int err)
{
	struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb);
	struct tipc_crypto *rx = aead->crypto;
	struct tipc_aead *tmp = NULL;
	struct tipc_ehdr *ehdr;
	struct tipc_node *n;

	/* Is this completed by TX? */
	if (unlikely(is_tx(aead->crypto))) {
		rx = skb_cb->tx_clone_ctx.rx;
		pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
			 (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
			 (*skb)->next, skb_cb->flags);
		pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
			 skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
			 aead->crypto->aead[1], aead->crypto->aead[2],
			 aead->crypto->aead[3]);
		if (unlikely(err)) {
			/* Retry with the cloned skb kept for the next round */
			if (err == -EBADMSG && (*skb)->next)
				tipc_rcv(net, (*skb)->next, b);
			goto free_skb;
		}

		/* Decryption worked, the clone for a retry is not needed */
		if (likely((*skb)->next)) {
			kfree_skb((*skb)->next);
			(*skb)->next = NULL;
		}
		ehdr = (struct tipc_ehdr *)(*skb)->data;
		if (!rx) {
			WARN_ON(ehdr->user != LINK_CONFIG);
			n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0,
					     true);
			rx = tipc_node_crypto_rx(n);
			if (unlikely(!rx))
				goto free_skb;
		}

		/* Ignore cloning if it was TX master key */
		if (ehdr->tx_key == KEY_MASTER)
			goto rcv;
		if (tipc_aead_clone(&tmp, aead) < 0)
			goto rcv;
		WARN_ON(!refcount_inc_not_zero(&tmp->refcnt));
		if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) {
			tipc_aead_free(&tmp->rcu);
			goto rcv;
		}
		/* Continue with the clone that is now attached to RX */
		tipc_aead_put(aead);
		aead = tmp;
	}

	if (unlikely(err)) {
		tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead,
				    INT_MIN);
		goto free_skb;
	}

	/* Set the RX key's user */
tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1); /* Mark this point, RX works */ rx->timer1 = jiffies; rcv: /* Remove ehdr & auth. tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; /* Mark this point, RX passive still works */ if (rx->key.passive && ehdr->tx_key == rx->key.passive) rx->timer2 = jiffies; skb_reset_network_header(*skb); skb_pull(*skb, tipc_ehdr_size(ehdr)); if (pskb_trim(*skb, (*skb)->len - aead->authsize)) goto free_skb; /* Validate TIPCv2 message */ if (unlikely(!tipc_msg_validate(skb))) { pr_err_ratelimited("Packet dropped after decryption!\n"); goto free_skb; } /* Ok, everything's fine, try to synch own keys according to peers' */ tipc_crypto_key_synch(rx, *skb); /* Re-fetch skb cb as skb might be changed in tipc_msg_validate */ skb_cb = TIPC_SKB_CB(*skb); /* Mark skb decrypted */ skb_cb->decrypted = 1; /* Clear clone cxt if any */ if (likely(!skb_cb->tx_clone_deferred)) goto exit; skb_cb->tx_clone_deferred = 0; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); goto exit; free_skb: kfree_skb(*skb); *skb = NULL; exit: tipc_aead_put(aead); if (rx) tipc_node_put(rx->node); } static void tipc_crypto_do_cmd(struct net *net, int cmd) { struct tipc_net *tn = tipc_net(net); struct tipc_crypto *tx = tn->crypto_tx, *rx; struct list_head *p; unsigned int stat; int i, j, cpu; char buf[200]; /* Currently only one command is supported */ switch (cmd) { case 0xfff1: goto print_stats; default: return; } print_stats: /* Print a header */ pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); /* Print key status */ pr_info("Key status:\n"); pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), tipc_crypto_key_dump(tx, buf)); rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), tipc_crypto_key_dump(rx, buf)); } rcu_read_unlock(); /* Print crypto statistics */ for (i = 0, j = 0; i < 
MAX_STATS; i++) j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); pr_info("Counter %s", buf); memset(buf, '-', 115); buf[115] = '\0'; pr_info("%s\n", buf); j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); j = scnprintf(buf, 200, "RX(%7.7s) ", tipc_node_get_id_str(rx->node)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } } rcu_read_unlock(); pr_info("\n======================== Done ========================\n"); } static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) { struct tipc_key key = c->key; struct tipc_aead *aead; int k, i = 0; char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { if (k == KEY_MASTER) { if (is_rx(c)) continue; if (time_before(jiffies, c->timer2 + TIPC_TX_GRACE_PERIOD)) s = "ACT"; else s = "PAS"; } else { if (k == key.passive) s = "PAS"; else if (k == key.active) s = "ACT"; else if (k == key.pending) s = "PEN"; else s = "-"; } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); aead = rcu_dereference(c->aead[k]); if (aead) i += scnprintf(buf + i, 200 - i, "{\"0x...%s\", \"%s\"}/%d:%d", aead->hint, (aead->mode == CLUSTER_KEY) ? 
"c" : "p", atomic_read(&aead->users), refcount_read(&aead->refcnt)); rcu_read_unlock(); i += scnprintf(buf + i, 200 - i, "\n"); } if (is_rx(c)) i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", atomic_read(&c->peer_rx_active)); return buf; } static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf) { struct tipc_key *key = &old; int k, i = 0; char *s; /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) s = "act"; else if (k == key->pending) s = "pen"; else s = "-"; i += scnprintf(buf + i, 32 - i, (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); key = &new; goto again; } i += scnprintf(buf + i, 32 - i, "]"); return buf; } /** * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point * @net: the struct net * @skb: the receiving message buffer */ void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb) { struct tipc_crypto *rx; struct tipc_msg *hdr; if (unlikely(skb_linearize(skb))) goto exit; hdr = buf_msg(skb); rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr)); if (unlikely(!rx)) goto exit; switch (msg_type(hdr)) { case KEY_DISTR_MSG: if (tipc_crypto_key_rcv(rx, hdr)) goto exit; break; default: break; } tipc_node_put(rx->node); exit: kfree_skb(skb); } /** * tipc_crypto_key_distr - Distribute a TX key * @tx: the TX crypto * @key: the key's index * @dest: the destination tipc node, = NULL if distributing to all nodes * * Return: 0 in case of success, otherwise < 0 */ int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, struct tipc_node *dest) { struct tipc_aead *aead; u32 dnode = tipc_node_get_addr(dest); int rc = -ENOKEY; if (!sysctl_tipc_key_exchange_enabled) return 0; if (key) { rcu_read_lock(); aead = tipc_aead_get(tx->aead[key]); if (likely(aead)) { rc = tipc_crypto_key_xmit(tx->net, aead->key, aead->gen, 
aead->mode, dnode); tipc_aead_put(aead); } rcu_read_unlock(); } return rc; } /** * tipc_crypto_key_xmit - Send a session key * @net: the struct net * @skey: the session key to be sent * @gen: the key's generation * @mode: the key's mode * @dnode: the destination node address, = 0 if broadcasting to all nodes * * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG' * as its data section, then xmit-ed through the uc/bc link. * * Return: 0 in case of success, otherwise < 0 */ static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode) { struct sk_buff_head pkts; struct tipc_msg *hdr; struct sk_buff *skb; u16 size, cong_link_cnt; u8 *data; int rc; size = tipc_aead_key_size(skey); skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = buf_msg(skb); tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG, INT_H_SIZE, dnode); msg_set_size(hdr, INT_H_SIZE + size); msg_set_key_gen(hdr, gen); msg_set_key_mode(hdr, mode); data = msg_data(hdr); *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen); memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME); memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key, skey->keylen); __skb_queue_head_init(&pkts); __skb_queue_tail(&pkts, skb); if (dnode) rc = tipc_node_xmit(net, &pkts, dnode, 0); else rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt); return rc; } /** * tipc_crypto_key_rcv - Receive a session key * @rx: the RX crypto * @hdr: the TIPC v2 message incl. the receiving session key in its data * * This function retrieves the session key in the message from peer, then * schedules a RX work to attach the key to the corresponding RX crypto. * * Return: "true" if the key has been scheduled for attaching, otherwise * "false". 
 */
static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr)
{
	struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx;
	struct tipc_aead_key *skey = NULL;
	u16 key_gen = msg_key_gen(hdr);
	u32 size = msg_data_sz(hdr);
	u8 *data = msg_data(hdr);
	unsigned int keylen;

	/* Message data as read below: the algorithm name (TIPC_AEAD_ALG_NAME
	 * bytes), a big-endian 32-bit key length, then the key bytes.
	 */

	/* Verify whether the size can exist in the packet */
	if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) {
		pr_debug("%s: message data size is too small\n", rx->name);
		goto exit;
	}

	/* The length comes from the peer: it is only trusted once it has
	 * passed the checks below.
	 */
	keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME)));

	/* Verify the supplied size values */
	/* NOTE(review): this relies on sizeof(struct tipc_aead_key) being
	 * exactly the name + length header size - confirm against the struct.
	 */
	if (unlikely(keylen > TIPC_AEAD_KEY_SIZE_MAX ||
		     size != keylen + sizeof(struct tipc_aead_key))) {
		pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name);
		goto exit;
	}

	spin_lock(&rx->lock);
	/* Refuse the key while a previously received one is still waiting to
	 * be attached, or when its generation equals the current one and
	 * key(s) are already in place.
	 */
	if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) {
		pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name,
		       rx->skey, key_gen, rx->key_gen);
		goto exit_unlock;
	}

	/* Allocate memory for the key */
	skey = kmalloc(size, GFP_ATOMIC);
	if (unlikely(!skey)) {
		pr_err("%s: unable to allocate memory for skey\n", rx->name);
		goto exit_unlock;
	}

	/* Copy key from msg data */
	skey->keylen = keylen;
	memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME);
	memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32),
	       skey->keylen);

	/* Publish the key on the RX crypto; it is not freed in this function,
	 * tipc_crypto_work_rx() releases it after trying to attach it.
	 */
	rx->key_gen = key_gen;
	rx->skey_mode = msg_key_mode(hdr);
	rx->skey = skey;
	rx->nokey = 0;
	mb(); /* for nokey flag */

exit_unlock:
	spin_unlock(&rx->lock);

exit:
	/* Schedule the key attaching on this crypto */
	/* 'skey' is still NULL here on every rejection path above, so only a
	 * freshly stored key triggers the work.
	 */
	if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0)))
		return true;

	return false;
}

/**
 * tipc_crypto_work_rx - Scheduled RX works handler
 * @work: the struct RX work
 *
 * The function processes the previous scheduled works i.e. distributing TX key
 * or attaching a received session key on RX crypto.
*/ static void tipc_crypto_work_rx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; unsigned long delay = msecs_to_jiffies(5000); bool resched = false; u8 key; int rc; /* Case 1: Distribute TX key to peer if scheduled */ if (atomic_cmpxchg(&rx->key_distr, KEY_DISTR_SCHED, KEY_DISTR_COMPL) == KEY_DISTR_SCHED) { /* Always pick the newest one for distributing */ key = tx->key.pending ?: tx->key.active; rc = tipc_crypto_key_distr(tx, key, rx->node); if (unlikely(rc)) pr_warn("%s: unable to distr key[%d] to %s, err %d\n", tx->name, key, tipc_node_get_id_str(rx->node), rc); /* Sched for key_distr releasing */ resched = true; } else { atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0); } /* Case 2: Attach a pending received session key from peer if any */ if (rx->skey) { rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false); if (unlikely(rc < 0)) pr_warn("%s: unable to attach received skey, err %d\n", rx->name, rc); switch (rc) { case -EBUSY: case -ENOMEM: /* Resched the key attaching */ resched = true; break; default: synchronize_rcu(); kfree(rx->skey); rx->skey = NULL; break; } } if (resched && queue_delayed_work(tx->wq, &rx->work, delay)) return; tipc_node_put(rx->node); } /** * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval * @tx: TX crypto * @changed: if the rekeying needs to be rescheduled with new interval * @new_intv: new rekeying interval (when "changed" = true) */ void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, u32 new_intv) { unsigned long delay; bool now = false; if (changed) { if (new_intv == TIPC_REKEYING_NOW) now = true; else tx->rekeying_intv = new_intv; cancel_delayed_work_sync(&tx->work); } if (tx->rekeying_intv || now) { delay = (now) ? 
0 : tx->rekeying_intv * 60 * 1000; queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); } } /** * tipc_crypto_work_tx - Scheduled TX works handler * @work: the struct TX work * * The function processes the previous scheduled work, i.e. key rekeying, by * generating a new session key based on current one, then attaching it to the * TX crypto and finally distributing it to peers. It also re-schedules the * rekeying if needed. */ static void tipc_crypto_work_tx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); struct tipc_aead_key *skey = NULL; struct tipc_key key = tx->key; struct tipc_aead *aead; int rc = -ENOMEM; if (unlikely(key.pending)) goto resched; /* Take current key as a template */ rcu_read_lock(); aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); if (unlikely(!aead)) { rcu_read_unlock(); /* At least one key should exist for securing */ return; } /* Lets duplicate it first */ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); rcu_read_unlock(); /* Now, generate new key, initiate & distribute it */ if (likely(skey)) { rc = tipc_aead_key_generate(skey) ?: tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); if (likely(rc > 0)) rc = tipc_crypto_key_distr(tx, rc, NULL); kfree_sensitive(skey); } if (unlikely(rc)) pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); resched: /* Re-schedule rekeying if any */ tipc_crypto_rekeying_sched(tx, false, 0); }
12 4 4 5 12 4 12 4 8 12 1 12 12 12 1 4 8 12 18 8 18 4 4 12 12 12 6 6 11 5 6 6 6 13 1 8 5 11 2 4 9 7 2 7 2 5 4 6 3 8 1 9 9 15 1 10 10 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 
493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 // SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. 
* * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. 
* * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. 
* */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. 
"old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. */ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. 
*/ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. 
*/ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. 
*/ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. 
*/ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. */ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, 
[TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); WRITE_ONCE(q->quantum, new_quantum); WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) WRITE_ONCE(q->hh_flows_limit, nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); WRITE_ONCE(q->hhf_reset_timeout, usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) WRITE_ONCE(q->hhf_admit_bytes, nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); WRITE_ONCE(q->hhf_evict_timeout, usecs_to_jiffies(us)); } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; dropped_pkts++; dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; 
sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if (err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. 
*/ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); 
MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
1 1 1 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 // 
SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for "8 devices" USB2CAN converter * * Copyright (C) 2012 Bernd Krumboeck (krumboeck@universalnet.at) * * This driver is inspired by the 3.2.0 version of drivers/net/can/usb/ems_usb.c * and drivers/net/can/usb/esd_usb2.c * * Many thanks to Gerhard Bertelsmann (info@gerhard-bertelsmann.de) * for testing and fixing this driver. Also many thanks to "8 devices", * who were very cooperative and answered my questions. */ #include <linux/ethtool.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/usb.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> /* driver constants */ #define MAX_RX_URBS 20 #define MAX_TX_URBS 20 #define RX_BUFFER_SIZE 64 /* vendor and product id */ #define USB_8DEV_VENDOR_ID 0x0483 #define USB_8DEV_PRODUCT_ID 0x1234 /* endpoints */ enum usb_8dev_endpoint { USB_8DEV_ENDP_DATA_RX = 1, USB_8DEV_ENDP_DATA_TX, USB_8DEV_ENDP_CMD_RX, USB_8DEV_ENDP_CMD_TX }; /* device CAN clock */ #define USB_8DEV_ABP_CLOCK 32000000 /* setup flags */ #define USB_8DEV_SILENT 0x01 #define USB_8DEV_LOOPBACK 0x02 #define USB_8DEV_DISABLE_AUTO_RESTRANS 0x04 #define USB_8DEV_STATUS_FRAME 0x08 /* commands */ enum usb_8dev_cmd { USB_8DEV_RESET = 1, USB_8DEV_OPEN, USB_8DEV_CLOSE, USB_8DEV_SET_SPEED, USB_8DEV_SET_MASK_FILTER, USB_8DEV_GET_STATUS, USB_8DEV_GET_STATISTICS, USB_8DEV_GET_SERIAL, USB_8DEV_GET_SOFTW_VER, USB_8DEV_GET_HARDW_VER, USB_8DEV_RESET_TIMESTAMP, USB_8DEV_GET_SOFTW_HARDW_VER }; /* command options */ #define USB_8DEV_BAUD_MANUAL 0x09 #define USB_8DEV_CMD_START 0x11 #define USB_8DEV_CMD_END 0x22 #define USB_8DEV_CMD_SUCCESS 0 #define USB_8DEV_CMD_ERROR 255 #define USB_8DEV_CMD_TIMEOUT 1000 /* frames */ #define USB_8DEV_DATA_START 0x55 #define USB_8DEV_DATA_END 0xAA #define USB_8DEV_TYPE_CAN_FRAME 0 #define USB_8DEV_TYPE_ERROR_FRAME 3 #define USB_8DEV_EXTID 0x01 #define USB_8DEV_RTR 0x02 #define USB_8DEV_ERR_FLAG 
0x04 /* status */ #define USB_8DEV_STATUSMSG_OK 0x00 /* Normal condition. */ #define USB_8DEV_STATUSMSG_OVERRUN 0x01 /* Overrun occurred when sending */ #define USB_8DEV_STATUSMSG_BUSLIGHT 0x02 /* Error counter has reached 96 */ #define USB_8DEV_STATUSMSG_BUSHEAVY 0x03 /* Error count. has reached 128 */ #define USB_8DEV_STATUSMSG_BUSOFF 0x04 /* Device is in BUSOFF */ #define USB_8DEV_STATUSMSG_STUFF 0x20 /* Stuff Error */ #define USB_8DEV_STATUSMSG_FORM 0x21 /* Form Error */ #define USB_8DEV_STATUSMSG_ACK 0x23 /* Ack Error */ #define USB_8DEV_STATUSMSG_BIT0 0x24 /* Bit1 Error */ #define USB_8DEV_STATUSMSG_BIT1 0x25 /* Bit0 Error */ #define USB_8DEV_STATUSMSG_CRC 0x27 /* CRC Error */ #define USB_8DEV_RP_MASK 0x7F /* Mask for Receive Error Bit */ /* table of devices that work with this driver */ static const struct usb_device_id usb_8dev_table[] = { { USB_DEVICE(USB_8DEV_VENDOR_ID, USB_8DEV_PRODUCT_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usb_8dev_table); struct usb_8dev_tx_urb_context { struct usb_8dev_priv *priv; u32 echo_index; }; /* Structure to hold all of our device specific stuff */ struct usb_8dev_priv { struct can_priv can; /* must be the first member */ struct usb_device *udev; struct net_device *netdev; atomic_t active_tx_urbs; struct usb_anchor tx_submitted; struct usb_8dev_tx_urb_context tx_contexts[MAX_TX_URBS]; struct usb_anchor rx_submitted; struct can_berr_counter bec; u8 *cmd_msg_buffer; struct mutex usb_8dev_cmd_lock; void *rxbuf[MAX_RX_URBS]; dma_addr_t rxbuf_dma[MAX_RX_URBS]; }; /* tx frame */ struct __packed usb_8dev_tx_msg { u8 begin; u8 flags; /* RTR and EXT_ID flag */ __be32 id; /* upper 3 bits not used */ u8 dlc; /* data length code 0-8 bytes */ u8 data[8]; /* 64-bit data */ u8 end; }; /* rx frame */ struct __packed usb_8dev_rx_msg { u8 begin; u8 type; /* frame type */ u8 flags; /* RTR and EXT_ID flag */ __be32 id; /* upper 3 bits not used */ u8 dlc; /* data length code 0-8 bytes */ u8 data[8]; /* 64-bit data */ __be32 
timestamp; /* 32-bit timestamp */ u8 end; }; /* command frame */ struct __packed usb_8dev_cmd_msg { u8 begin; u8 channel; /* unknown - always 0 */ u8 command; /* command to execute */ u8 opt1; /* optional parameter / return value */ u8 opt2; /* optional parameter 2 */ u8 data[10]; /* optional parameter and data */ u8 end; }; static int usb_8dev_send_cmd_msg(struct usb_8dev_priv *priv, u8 *msg, int size) { int actual_length; return usb_bulk_msg(priv->udev, usb_sndbulkpipe(priv->udev, USB_8DEV_ENDP_CMD_TX), msg, size, &actual_length, USB_8DEV_CMD_TIMEOUT); } static int usb_8dev_wait_cmd_msg(struct usb_8dev_priv *priv, u8 *msg, int size, int *actual_length) { return usb_bulk_msg(priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_CMD_RX), msg, size, actual_length, USB_8DEV_CMD_TIMEOUT); } /* Send command to device and receive result. * Command was successful when opt1 = 0. */ static int usb_8dev_send_cmd(struct usb_8dev_priv *priv, struct usb_8dev_cmd_msg *out, struct usb_8dev_cmd_msg *in) { int err; int num_bytes_read; struct net_device *netdev; netdev = priv->netdev; out->begin = USB_8DEV_CMD_START; out->end = USB_8DEV_CMD_END; mutex_lock(&priv->usb_8dev_cmd_lock); memcpy(priv->cmd_msg_buffer, out, sizeof(struct usb_8dev_cmd_msg)); err = usb_8dev_send_cmd_msg(priv, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg)); if (err < 0) { netdev_err(netdev, "sending command message failed\n"); goto failed; } err = usb_8dev_wait_cmd_msg(priv, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg), &num_bytes_read); if (err < 0) { netdev_err(netdev, "no command message answer\n"); goto failed; } memcpy(in, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg)); if (in->begin != USB_8DEV_CMD_START || in->end != USB_8DEV_CMD_END || num_bytes_read != 16 || in->opt1 != 0) err = -EPROTO; failed: mutex_unlock(&priv->usb_8dev_cmd_lock); return err; } /* Send open command to device */ static int usb_8dev_cmd_open(struct usb_8dev_priv *priv) { struct can_bittiming *bt = 
&priv->can.bittiming; struct usb_8dev_cmd_msg outmsg; struct usb_8dev_cmd_msg inmsg; u32 ctrlmode = priv->can.ctrlmode; u32 flags = USB_8DEV_STATUS_FRAME; __be32 beflags; __be16 bebrp; memset(&outmsg, 0, sizeof(outmsg)); outmsg.command = USB_8DEV_OPEN; outmsg.opt1 = USB_8DEV_BAUD_MANUAL; outmsg.data[0] = bt->prop_seg + bt->phase_seg1; outmsg.data[1] = bt->phase_seg2; outmsg.data[2] = bt->sjw; /* BRP */ bebrp = cpu_to_be16((u16)bt->brp); memcpy(&outmsg.data[3], &bebrp, sizeof(bebrp)); /* flags */ if (ctrlmode & CAN_CTRLMODE_LOOPBACK) flags |= USB_8DEV_LOOPBACK; if (ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= USB_8DEV_SILENT; if (ctrlmode & CAN_CTRLMODE_ONE_SHOT) flags |= USB_8DEV_DISABLE_AUTO_RESTRANS; beflags = cpu_to_be32(flags); memcpy(&outmsg.data[5], &beflags, sizeof(beflags)); return usb_8dev_send_cmd(priv, &outmsg, &inmsg); } /* Send close command to device */ static int usb_8dev_cmd_close(struct usb_8dev_priv *priv) { struct usb_8dev_cmd_msg inmsg; struct usb_8dev_cmd_msg outmsg = { .channel = 0, .command = USB_8DEV_CLOSE, .opt1 = 0, .opt2 = 0 }; return usb_8dev_send_cmd(priv, &outmsg, &inmsg); } /* Get firmware and hardware version */ static int usb_8dev_cmd_version(struct usb_8dev_priv *priv, u32 *res) { struct usb_8dev_cmd_msg inmsg; struct usb_8dev_cmd_msg outmsg = { .channel = 0, .command = USB_8DEV_GET_SOFTW_HARDW_VER, .opt1 = 0, .opt2 = 0 }; int err = usb_8dev_send_cmd(priv, &outmsg, &inmsg); if (err) return err; *res = be32_to_cpup((__be32 *)inmsg.data); return err; } /* Set network device mode * * Maybe we should leave this function empty, because the device * set mode variable with open command. 
*/ static int usb_8dev_set_mode(struct net_device *netdev, enum can_mode mode) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err = 0; switch (mode) { case CAN_MODE_START: err = usb_8dev_cmd_open(priv); if (err) netdev_warn(netdev, "couldn't start device"); break; default: return -EOPNOTSUPP; } return err; } /* Read error/status frames */ static void usb_8dev_rx_err_msg(struct usb_8dev_priv *priv, struct usb_8dev_rx_msg *msg) { struct can_frame *cf; struct sk_buff *skb; struct net_device_stats *stats = &priv->netdev->stats; /* Error message: * byte 0: Status * byte 1: bit 7: Receive Passive * byte 1: bit 0-6: Receive Error Counter * byte 2: Transmit Error Counter * byte 3: Always 0 (maybe reserved for future use) */ u8 state = msg->data[0]; u8 rxerr = msg->data[1] & USB_8DEV_RP_MASK; u8 txerr = msg->data[2]; int rx_errors = 0; int tx_errors = 0; skb = alloc_can_err_skb(priv->netdev, &cf); if (!skb) return; switch (state) { case USB_8DEV_STATUSMSG_OK: priv->can.state = CAN_STATE_ERROR_ACTIVE; cf->can_id |= CAN_ERR_PROT; cf->data[2] = CAN_ERR_PROT_ACTIVE; break; case USB_8DEV_STATUSMSG_BUSOFF: priv->can.state = CAN_STATE_BUS_OFF; cf->can_id |= CAN_ERR_BUSOFF; priv->can.can_stats.bus_off++; can_bus_off(priv->netdev); break; case USB_8DEV_STATUSMSG_OVERRUN: case USB_8DEV_STATUSMSG_BUSLIGHT: case USB_8DEV_STATUSMSG_BUSHEAVY: cf->can_id |= CAN_ERR_CRTL; break; default: priv->can.state = CAN_STATE_ERROR_WARNING; cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; priv->can.can_stats.bus_error++; break; } switch (state) { case USB_8DEV_STATUSMSG_OK: case USB_8DEV_STATUSMSG_BUSOFF: break; case USB_8DEV_STATUSMSG_ACK: cf->can_id |= CAN_ERR_ACK; tx_errors = 1; break; case USB_8DEV_STATUSMSG_CRC: cf->data[3] = CAN_ERR_PROT_LOC_CRC_SEQ; rx_errors = 1; break; case USB_8DEV_STATUSMSG_BIT0: cf->data[2] |= CAN_ERR_PROT_BIT0; tx_errors = 1; break; case USB_8DEV_STATUSMSG_BIT1: cf->data[2] |= CAN_ERR_PROT_BIT1; tx_errors = 1; break; case USB_8DEV_STATUSMSG_FORM: cf->data[2] |= 
CAN_ERR_PROT_FORM; rx_errors = 1; break; case USB_8DEV_STATUSMSG_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; rx_errors = 1; break; case USB_8DEV_STATUSMSG_OVERRUN: cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; stats->rx_over_errors++; rx_errors = 1; break; case USB_8DEV_STATUSMSG_BUSLIGHT: priv->can.state = CAN_STATE_ERROR_WARNING; cf->data[1] = (txerr > rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; priv->can.can_stats.error_warning++; break; case USB_8DEV_STATUSMSG_BUSHEAVY: priv->can.state = CAN_STATE_ERROR_PASSIVE; cf->data[1] = (txerr > rxerr) ? CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE; priv->can.can_stats.error_passive++; break; default: netdev_warn(priv->netdev, "Unknown status/error message (%d)\n", state); break; } if (tx_errors) { cf->data[2] |= CAN_ERR_PROT_TX; stats->tx_errors++; } if (rx_errors) stats->rx_errors++; if (priv->can.state != CAN_STATE_BUS_OFF) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } priv->bec.txerr = txerr; priv->bec.rxerr = rxerr; netif_rx(skb); } /* Read data and status frames */ static void usb_8dev_rx_can_msg(struct usb_8dev_priv *priv, struct usb_8dev_rx_msg *msg) { struct can_frame *cf; struct sk_buff *skb; struct net_device_stats *stats = &priv->netdev->stats; if (msg->type == USB_8DEV_TYPE_ERROR_FRAME && msg->flags == USB_8DEV_ERR_FLAG) { usb_8dev_rx_err_msg(priv, msg); } else if (msg->type == USB_8DEV_TYPE_CAN_FRAME) { skb = alloc_can_skb(priv->netdev, &cf); if (!skb) return; cf->can_id = be32_to_cpu(msg->id); can_frame_set_cc_len(cf, msg->dlc & 0xF, priv->can.ctrlmode); if (msg->flags & USB_8DEV_EXTID) cf->can_id |= CAN_EFF_FLAG; if (msg->flags & USB_8DEV_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { memcpy(cf->data, msg->data, cf->len); stats->rx_bytes += cf->len; } stats->rx_packets++; netif_rx(skb); } else { netdev_warn(priv->netdev, "frame type %d unknown", msg->type); } } /* Callback for reading data from device * * Check urb status, call read function and resubmit urb read 
operation. */ static void usb_8dev_read_bulk_callback(struct urb *urb) { struct usb_8dev_priv *priv = urb->context; struct net_device *netdev; int retval; int pos = 0; netdev = priv->netdev; if (!netif_device_present(netdev)) return; switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: netdev_info(netdev, "Rx URB aborted (%d)\n", urb->status); goto resubmit_urb; } while (pos < urb->actual_length) { struct usb_8dev_rx_msg *msg; if (pos + sizeof(struct usb_8dev_rx_msg) > urb->actual_length) { netdev_err(priv->netdev, "format error\n"); break; } msg = (struct usb_8dev_rx_msg *)(urb->transfer_buffer + pos); usb_8dev_rx_can_msg(priv, msg); pos += sizeof(struct usb_8dev_rx_msg); } resubmit_urb: usb_fill_bulk_urb(urb, priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_RX), urb->transfer_buffer, RX_BUFFER_SIZE, usb_8dev_read_bulk_callback, priv); retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval == -ENODEV) netif_device_detach(netdev); else if (retval) netdev_err(netdev, "failed resubmitting read bulk urb: %d\n", retval); } /* Callback handler for write operations * * Free allocated buffers, check transmit status and * calculate statistic. 
*/ static void usb_8dev_write_bulk_callback(struct urb *urb) { struct usb_8dev_tx_urb_context *context = urb->context; struct usb_8dev_priv *priv; struct net_device *netdev; BUG_ON(!context); priv = context->priv; netdev = priv->netdev; /* free up our allocated buffer */ usb_free_coherent(urb->dev, urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); atomic_dec(&priv->active_tx_urbs); if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "Tx URB aborted (%d)\n", urb->status); netdev->stats.tx_packets++; netdev->stats.tx_bytes += can_get_echo_skb(netdev, context->echo_index, NULL); /* Release context */ context->echo_index = MAX_TX_URBS; netif_wake_queue(netdev); } /* Send data to device */ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); struct net_device_stats *stats = &netdev->stats; struct can_frame *cf = (struct can_frame *) skb->data; struct usb_8dev_tx_msg *msg; struct urb *urb; struct usb_8dev_tx_urb_context *context = NULL; u8 *buf; int i, err; size_t size = sizeof(struct usb_8dev_tx_msg); if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto nomem; buf = usb_alloc_coherent(priv->udev, size, GFP_ATOMIC, &urb->transfer_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); goto nomembuf; } memset(buf, 0, size); msg = (struct usb_8dev_tx_msg *)buf; msg->begin = USB_8DEV_DATA_START; msg->flags = 0x00; if (cf->can_id & CAN_RTR_FLAG) msg->flags |= USB_8DEV_RTR; if (cf->can_id & CAN_EFF_FLAG) msg->flags |= USB_8DEV_EXTID; msg->id = cpu_to_be32(cf->can_id & CAN_ERR_MASK); msg->dlc = can_get_cc_dlc(cf, priv->can.ctrlmode); memcpy(msg->data, cf->data, cf->len); msg->end = USB_8DEV_DATA_END; for (i = 0; i < MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) { context = 
&priv->tx_contexts[i]; break; } } /* May never happen! When this happens we'd more URBs in flight as * allowed (MAX_TX_URBS). */ if (!context) goto nofreecontext; context->priv = priv; context->echo_index = i; usb_fill_bulk_urb(urb, priv->udev, usb_sndbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_TX), buf, size, usb_8dev_write_bulk_callback, context); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_urbs); err = usb_submit_urb(urb, GFP_ATOMIC); if (unlikely(err)) { can_free_echo_skb(netdev, context->echo_index, NULL); usb_unanchor_urb(urb); usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); atomic_dec(&priv->active_tx_urbs); if (err == -ENODEV) netif_device_detach(netdev); else netdev_warn(netdev, "failed tx_urb %d\n", err); stats->tx_dropped++; } else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) /* Slow down tx path */ netif_stop_queue(netdev); /* Release our reference to this URB, the USB core will eventually free * it entirely. 
*/ usb_free_urb(urb); return NETDEV_TX_OK; nofreecontext: usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); usb_free_urb(urb); netdev_warn(netdev, "couldn't find free context"); return NETDEV_TX_BUSY; nomembuf: usb_free_urb(urb); nomem: dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; } static int usb_8dev_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct usb_8dev_priv *priv = netdev_priv(netdev); bec->txerr = priv->bec.txerr; bec->rxerr = priv->bec.rxerr; return 0; } /* Start USB device */ static int usb_8dev_start(struct usb_8dev_priv *priv) { struct net_device *netdev = priv->netdev; int err, i; for (i = 0; i < MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf; dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { err = -ENOMEM; break; } buf = usb_alloc_coherent(priv->udev, RX_BUFFER_SIZE, GFP_KERNEL, &buf_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); usb_free_urb(urb); err = -ENOMEM; break; } urb->transfer_dma = buf_dma; usb_fill_bulk_urb(urb, priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_RX), buf, RX_BUFFER_SIZE, usb_8dev_read_bulk_callback, priv); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->rx_submitted); err = usb_submit_urb(urb, GFP_KERNEL); if (err) { usb_unanchor_urb(urb); usb_free_coherent(priv->udev, RX_BUFFER_SIZE, buf, urb->transfer_dma); usb_free_urb(urb); break; } priv->rxbuf[i] = buf; priv->rxbuf_dma[i] = buf_dma; /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); } /* Did we submit any URBs */ if (i == 0) { netdev_warn(netdev, "couldn't setup read URBs\n"); return err; } /* Warn if we've couldn't transmit all the URBs */ if (i < MAX_RX_URBS) netdev_warn(netdev, "rx performance may be slow\n"); err = usb_8dev_cmd_open(priv); if (err) goto failed; priv->can.state = CAN_STATE_ERROR_ACTIVE; return 0; failed: if (err == -ENODEV) 
netif_device_detach(priv->netdev); netdev_warn(netdev, "couldn't submit control: %d\n", err); return err; } /* Open USB device */ static int usb_8dev_open(struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err; /* common open */ err = open_candev(netdev); if (err) return err; /* finally start device */ err = usb_8dev_start(priv); if (err) { if (err == -ENODEV) netif_device_detach(priv->netdev); netdev_warn(netdev, "couldn't start device: %d\n", err); close_candev(netdev); return err; } netif_start_queue(netdev); return 0; } static void unlink_all_urbs(struct usb_8dev_priv *priv) { int i; usb_kill_anchored_urbs(&priv->rx_submitted); for (i = 0; i < MAX_RX_URBS; ++i) usb_free_coherent(priv->udev, RX_BUFFER_SIZE, priv->rxbuf[i], priv->rxbuf_dma[i]); usb_kill_anchored_urbs(&priv->tx_submitted); atomic_set(&priv->active_tx_urbs, 0); for (i = 0; i < MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = MAX_TX_URBS; } /* Close USB device */ static int usb_8dev_close(struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err = 0; /* Send CLOSE command to CAN controller */ err = usb_8dev_cmd_close(priv); if (err) netdev_warn(netdev, "couldn't stop device"); priv->can.state = CAN_STATE_STOPPED; netif_stop_queue(netdev); /* Stop polling */ unlink_all_urbs(priv); close_candev(netdev); return err; } static const struct net_device_ops usb_8dev_netdev_ops = { .ndo_open = usb_8dev_open, .ndo_stop = usb_8dev_close, .ndo_start_xmit = usb_8dev_start_xmit, .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops usb_8dev_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const struct can_bittiming_const usb_8dev_bittiming_const = { .name = KBUILD_MODNAME, .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; /* Probe USB device * * Check device and firmware. * Set supported modes and bittiming constants. * Allocate some memory. 
*/ static int usb_8dev_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct net_device *netdev; struct usb_8dev_priv *priv; int i, err = -ENOMEM; u32 version; char buf[18]; struct usb_device *usbdev = interface_to_usbdev(intf); /* product id looks strange, better we also check iProduct string */ if (usb_string(usbdev, usbdev->descriptor.iProduct, buf, sizeof(buf)) > 0 && strcmp(buf, "USB2CAN converter")) { dev_info(&usbdev->dev, "ignoring: not an USB2CAN converter\n"); return -ENODEV; } netdev = alloc_candev(sizeof(struct usb_8dev_priv), MAX_TX_URBS); if (!netdev) { dev_err(&intf->dev, "Couldn't alloc candev\n"); return -ENOMEM; } priv = netdev_priv(netdev); priv->udev = usbdev; priv->netdev = netdev; priv->can.state = CAN_STATE_STOPPED; priv->can.clock.freq = USB_8DEV_ABP_CLOCK; priv->can.bittiming_const = &usb_8dev_bittiming_const; priv->can.do_set_mode = usb_8dev_set_mode; priv->can.do_get_berr_counter = usb_8dev_get_berr_counter; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_ONE_SHOT | CAN_CTRLMODE_CC_LEN8_DLC; netdev->netdev_ops = &usb_8dev_netdev_ops; netdev->ethtool_ops = &usb_8dev_ethtool_ops; netdev->flags |= IFF_ECHO; /* we support local echo */ init_usb_anchor(&priv->rx_submitted); init_usb_anchor(&priv->tx_submitted); atomic_set(&priv->active_tx_urbs, 0); for (i = 0; i < MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = MAX_TX_URBS; priv->cmd_msg_buffer = devm_kzalloc(&intf->dev, sizeof(struct usb_8dev_cmd_msg), GFP_KERNEL); if (!priv->cmd_msg_buffer) goto cleanup_candev; usb_set_intfdata(intf, priv); SET_NETDEV_DEV(netdev, &intf->dev); mutex_init(&priv->usb_8dev_cmd_lock); err = register_candev(netdev); if (err) { netdev_err(netdev, "couldn't register CAN device: %d\n", err); goto cleanup_candev; } err = usb_8dev_cmd_version(priv, &version); if (err) { netdev_err(netdev, "can't get firmware version\n"); goto cleanup_unregister_candev; } else { netdev_info(netdev, "firmware: %d.%d, 
hardware: %d.%d\n", (version>>24) & 0xff, (version>>16) & 0xff, (version>>8) & 0xff, version & 0xff); } return 0; cleanup_unregister_candev: unregister_netdev(priv->netdev); cleanup_candev: free_candev(netdev); return err; } /* Called by the usb core when driver is unloaded or device is removed */ static void usb_8dev_disconnect(struct usb_interface *intf) { struct usb_8dev_priv *priv = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (priv) { netdev_info(priv->netdev, "device disconnected\n"); unregister_netdev(priv->netdev); unlink_all_urbs(priv); free_candev(priv->netdev); } } static struct usb_driver usb_8dev_driver = { .name = KBUILD_MODNAME, .probe = usb_8dev_probe, .disconnect = usb_8dev_disconnect, .id_table = usb_8dev_table, }; module_usb_driver(usb_8dev_driver); MODULE_AUTHOR("Bernd Krumboeck <krumboeck@universalnet.at>"); MODULE_DESCRIPTION("CAN driver for 8 devices USB2CAN interfaces"); MODULE_LICENSE("GPL v2");
3 1 2 1 3 3 3 3 4 3 2 3 1 3 2 2 3 2 1 2 4 3 3 3 2 3 3 1 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 // SPDX-License-Identifier: GPL-2.0-only /* * TCP Westwood+: end-to-end bandwidth estimation for TCP * * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 * * Support at http://c3lab.poliba.it/index.php/Westwood * Main references in literature: * * - Mascolo S, Casetti, M. Gerla et al. * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 * * - A. Grieco, s. Mascolo * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer * Comm. Review, 2004 * * - A. Dell'Aera, L. Grieco, S. Mascolo. * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 * * Westwood+ employs end-to-end bandwidth measurement to set cwnd and * ssthresh after packet loss. 
The probing phase is as the original Reno. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> /* TCP Westwood structure */ struct westwood { u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ u32 bw_est; /* bandwidth estimate */ u32 rtt_win_sx; /* here starts a new evaluation... */ u32 bk; u32 snd_una; /* used for evaluating the number of acked bytes */ u32 cumul_ack; u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; /* TCP Westwood functions and constants */ #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ /* * @tcp_westwood_create * This function initializes fields used in TCP Westwood+, * it is called after the initial SYN, so the sequence numbers * are correct but new passive connections we have no * information about RTTmin at this time so we simply set it to * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative * since in this way we're sure it will be updated in a consistent * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. */ static void tcp_westwood_init(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } /* * @westwood_do_filter * Low-pass filter. Implemented using constant coefficients. 
*/ static inline u32 westwood_do_filter(u32 a, u32 b) { return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) { /* If the filter is empty fill it with the first sample of bandwidth */ if (w->bw_ns_est == 0 && w->bw_est == 0) { w->bw_ns_est = w->bk / delta; w->bw_est = w->bw_ns_est; } else { w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); } } /* * @westwood_pkts_acked * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ static void tcp_westwood_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct westwood *w = inet_csk_ca(sk); if (sample->rtt_us > 0) w->rtt = usecs_to_jiffies(sample->rtt_us); } /* * @westwood_update_window * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ if (w->first_ack) { w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 0; } /* * See if a RTT-window has passed. * Be careful since if RTT is less than * 50ms we don't filter but we continue 'building the sample'. * This minimum limit was chosen since an estimation on small * time intervals is better to avoid... 
* Obviously on a LAN we reasonably will always have * right_bound = left_bound + WESTWOOD_RTT_MIN */ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { westwood_filter(w, delta); w->bk = 0; w->rtt_win_sx = tcp_jiffies32; } } static inline void update_rtt_min(struct westwood *w) { if (w->reset_rtt_min) { w->rtt_min = w->rtt; w->reset_rtt_min = 0; } else w->rtt_min = min(w->rtt, w->rtt_min); } /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ static inline void westwood_fast_bw(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; update_rtt_min(w); } /* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ static inline u32 westwood_acked_count(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; /* If cumul_ack is 0 this is a dupack since it's not moving * tp->snd_una. */ if (!w->cumul_ack) { w->accounted += tp->mss_cache; w->cumul_ack = tp->mss_cache; } if (w->cumul_ack > tp->mss_cache) { /* Partial or delayed ack */ if (w->accounted >= w->cumul_ack) { w->accounted -= w->cumul_ack; w->cumul_ack = tp->mss_cache; } else { w->cumul_ack -= w->accounted; w->accounted = 0; } } w->snd_una = tp->snd_una; return w->cumul_ack; } /* * TCP Westwood * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. 
*/ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { if (ack_flags & CA_ACK_SLOWPATH) { struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += westwood_acked_count(sk); update_rtt_min(w); return; } westwood_fast_bw(sk); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { case CA_EVENT_COMPLETE_CWR: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); tcp_snd_cwnd_set(tp, tp->snd_ssthresh); break; case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; default: /* don't care */ break; } } /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = 0; info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt); info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, .owner = THIS_MODULE, .name = "westwood" }; static int __init tcp_westwood_register(void) { BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } static void __exit 
tcp_westwood_unregister(void) { tcp_unregister_congestion_control(&tcp_westwood); } module_init(tcp_westwood_register); module_exit(tcp_westwood_unregister); MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Westwood+");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #ifndef __XFS_RMAP_ITEM_H__ #define __XFS_RMAP_ITEM_H__ /* * There are (currently) three pairs of rmap btree redo item types: map, unmap, * and convert. The common abbreviations for these are RUI (rmap update * intent) and RUD (rmap update done). The redo item type is encoded in the * flags field of each xfs_map_extent. * * *I items should be recorded in the *first* of a series of rolled * transactions, and the *D items should be recorded in the same transaction * that records the associated rmapbt updates. Typically, the first * transaction will record a bmbt update, followed by some number of * transactions containing rmapbt updates, and finally transactions with any * bnobt/cntbt updates. * * Should the system crash after the commit of the first transaction but * before the commit of the final transaction in a series, log recovery will * use the redo information recorded by the intent items to replay the * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction. */ /* kernel only RUI/RUD definitions */ struct xfs_mount; struct kmem_cache; /* * Max number of extents in fast allocation path. */ #define XFS_RUI_MAX_FAST_EXTENTS 16 /* * This is the "rmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "rmap update done" log item described below. * * These log items follow the same rules as struct xfs_efi_log_item; see the * comments about that structure (in xfs_extfree_item.h) for more details. 
*/ struct xfs_rui_log_item { struct xfs_log_item rui_item; atomic_t rui_refcount; atomic_t rui_next_extent; struct xfs_rui_log_format rui_format; }; static inline size_t xfs_rui_log_item_sizeof( unsigned int nr) { return offsetof(struct xfs_rui_log_item, rui_format) + xfs_rui_log_format_sizeof(nr); } /* * This is the "rmap update done" log item. It is used to log the fact that * some rmapbt updates mentioned in an earlier rui item have been performed. */ struct xfs_rud_log_item { struct xfs_log_item rud_item; struct xfs_rui_log_item *rud_ruip; struct xfs_rud_log_format rud_format; }; extern struct kmem_cache *xfs_rui_cache; extern struct kmem_cache *xfs_rud_cache; struct xfs_rmap_intent; void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); unsigned int xfs_rui_log_space(unsigned int nr); unsigned int xfs_rud_log_space(void); #endif /* __XFS_RMAP_ITEM_H__ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. 
*/ #ifndef MEM_H_MODULE #define MEM_H_MODULE /*-**************************************** * Dependencies ******************************************/ #include <linux/unaligned.h> /* get_unaligned, put_unaligned* */ #include <linux/compiler.h> /* inline */ #include <linux/swab.h> /* swab32, swab64 */ #include <linux/types.h> /* size_t, ptrdiff_t */ #include "debug.h" /* DEBUG_STATIC_ASSERT */ /*-**************************************** * Compiler specifics ******************************************/ #undef MEM_STATIC /* may be already defined from common/compiler.h */ #define MEM_STATIC static inline /*-************************************************************** * Basic Types *****************************************************************/ typedef uint8_t BYTE; typedef uint8_t U8; typedef int8_t S8; typedef uint16_t U16; typedef int16_t S16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; typedef int64_t S64; /*-************************************************************** * Memory I/O API *****************************************************************/ /*=== Static platform detection ===*/ MEM_STATIC unsigned MEM_32bits(void); MEM_STATIC unsigned MEM_64bits(void); MEM_STATIC unsigned MEM_isLittleEndian(void); /*=== Native unaligned read/write ===*/ MEM_STATIC U16 MEM_read16(const void* memPtr); MEM_STATIC U32 MEM_read32(const void* memPtr); MEM_STATIC U64 MEM_read64(const void* memPtr); MEM_STATIC size_t MEM_readST(const void* memPtr); MEM_STATIC void MEM_write16(void* memPtr, U16 value); MEM_STATIC void MEM_write32(void* memPtr, U32 value); MEM_STATIC void MEM_write64(void* memPtr, U64 value); /*=== Little endian unaligned read/write ===*/ MEM_STATIC U16 MEM_readLE16(const void* memPtr); MEM_STATIC U32 MEM_readLE24(const void* memPtr); MEM_STATIC U32 MEM_readLE32(const void* memPtr); MEM_STATIC U64 MEM_readLE64(const void* memPtr); MEM_STATIC size_t MEM_readLEST(const void* memPtr); MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val); 
MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val); MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32); MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64); MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val); /*=== Big endian unaligned read/write ===*/ MEM_STATIC U32 MEM_readBE32(const void* memPtr); MEM_STATIC U64 MEM_readBE64(const void* memPtr); MEM_STATIC size_t MEM_readBEST(const void* memPtr); MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32); MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64); MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val); /*=== Byteswap ===*/ MEM_STATIC U32 MEM_swap32(U32 in); MEM_STATIC U64 MEM_swap64(U64 in); MEM_STATIC size_t MEM_swapST(size_t in); /*-************************************************************** * Memory I/O Implementation *****************************************************************/ MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t) == 4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t) == 8; } #if defined(__LITTLE_ENDIAN) #define MEM_LITTLE_ENDIAN 1 #else #define MEM_LITTLE_ENDIAN 0 #endif MEM_STATIC unsigned MEM_isLittleEndian(void) { return MEM_LITTLE_ENDIAN; } MEM_STATIC U16 MEM_read16(const void *memPtr) { return get_unaligned((const U16 *)memPtr); } MEM_STATIC U32 MEM_read32(const void *memPtr) { return get_unaligned((const U32 *)memPtr); } MEM_STATIC U64 MEM_read64(const void *memPtr) { return get_unaligned((const U64 *)memPtr); } MEM_STATIC size_t MEM_readST(const void *memPtr) { return get_unaligned((const size_t *)memPtr); } MEM_STATIC void MEM_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); } MEM_STATIC void MEM_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); } MEM_STATIC void MEM_write64(void *memPtr, U64 value) { put_unaligned(value, (U64 *)memPtr); } /*=== Little endian r/w ===*/ MEM_STATIC U16 MEM_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } MEM_STATIC void 
MEM_writeLE16(void *memPtr, U16 val) { put_unaligned_le16(val, memPtr); } MEM_STATIC U32 MEM_readLE24(const void *memPtr) { return MEM_readLE16(memPtr) + (((const BYTE *)memPtr)[2] << 16); } MEM_STATIC void MEM_writeLE24(void *memPtr, U32 val) { MEM_writeLE16(memPtr, (U16)val); ((BYTE *)memPtr)[2] = (BYTE)(val >> 16); } MEM_STATIC U32 MEM_readLE32(const void *memPtr) { return get_unaligned_le32(memPtr); } MEM_STATIC void MEM_writeLE32(void *memPtr, U32 val32) { put_unaligned_le32(val32, memPtr); } MEM_STATIC U64 MEM_readLE64(const void *memPtr) { return get_unaligned_le64(memPtr); } MEM_STATIC void MEM_writeLE64(void *memPtr, U64 val64) { put_unaligned_le64(val64, memPtr); } MEM_STATIC size_t MEM_readLEST(const void *memPtr) { if (MEM_32bits()) return (size_t)MEM_readLE32(memPtr); else return (size_t)MEM_readLE64(memPtr); } MEM_STATIC void MEM_writeLEST(void *memPtr, size_t val) { if (MEM_32bits()) MEM_writeLE32(memPtr, (U32)val); else MEM_writeLE64(memPtr, (U64)val); } /*=== Big endian r/w ===*/ MEM_STATIC U32 MEM_readBE32(const void *memPtr) { return get_unaligned_be32(memPtr); } MEM_STATIC void MEM_writeBE32(void *memPtr, U32 val32) { put_unaligned_be32(val32, memPtr); } MEM_STATIC U64 MEM_readBE64(const void *memPtr) { return get_unaligned_be64(memPtr); } MEM_STATIC void MEM_writeBE64(void *memPtr, U64 val64) { put_unaligned_be64(val64, memPtr); } MEM_STATIC size_t MEM_readBEST(const void *memPtr) { if (MEM_32bits()) return (size_t)MEM_readBE32(memPtr); else return (size_t)MEM_readBE64(memPtr); } MEM_STATIC void MEM_writeBEST(void *memPtr, size_t val) { if (MEM_32bits()) MEM_writeBE32(memPtr, (U32)val); else MEM_writeBE64(memPtr, (U64)val); } MEM_STATIC U32 MEM_swap32(U32 in) { return swab32(in); } MEM_STATIC U64 MEM_swap64(U64 in) { return swab64(in); } MEM_STATIC size_t MEM_swapST(size_t in) { if (MEM_32bits()) return (size_t)MEM_swap32((U32)in); else return (size_t)MEM_swap64((U64)in); } #endif /* MEM_H_MODULE */
55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 
1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 
2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 
2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 
3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/dir.c * * Copyright (C) 1992 Rick Sladkey * * nfs directory handling functions * * 10 Apr 1996 Added silly rename 
for unlink --okir
 *  28 Sep 1996	Improved directory cache --okir
 *  23 Aug 1997  Claus Heine claus@momo.math.rwth-aachen.de
 *              Re-implemented silly rename for unlink, newly implemented
 *              silly rename for nfs_rename() following the suggestions
 *              of Olaf Kirch (okir) found in this file.
 *              Following Linus comments on my original hack, this version
 *              depends only on the dcache stuff and doesn't touch the inode
 *              layer (iput() and friends).
 *  6 Jun 1999	Cache readdir lookups in the page cache. -DaveM
 */

#include <linux/compat.h>
#include <linux/module.h>
#include <linux/time.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/swap.h>
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
#include <linux/hash.h>

#include "delegation.h"
#include "iostat.h"
#include "internal.h"
#include "fscache.h"

#include "nfstrace.h"

/* #define NFS_DEBUG_VERBOSE 1 */

/*
 * Forward declarations of file-local functions that are referenced before
 * their definitions (e.g. by the operation tables just below).
 */
static int nfs_opendir(struct inode *, struct file *);
static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
static void nfs_readdir_clear_array(struct folio *);
static int nfs_do_create(struct inode *dir, struct dentry *dentry,
			 umode_t mode, int open_flags);

/* File operations installed for NFS directories. */
const struct file_operations nfs_dir_operations = {
	.llseek		= nfs_llseek_dir,
	.read		= generic_read_dir,
	.iterate_shared	= nfs_readdir,
	.open		= nfs_opendir,
	.release	= nfs_closedir,
	.fsync		= nfs_fsync_dir,
};

/*
 * Address-space operations for the directory readdir cache.  The only hook is
 * ->free_folio, which is wired to nfs_readdir_clear_array() so that the name
 * strings referenced from a cache folio are released with it.
 */
const struct address_space_operations nfs_dir_aops = {
	.free_folio = nfs_readdir_clear_array,
};

/* Initial ->dtsize given to a freshly allocated open-directory context. */
#define NFS_INIT_DTSIZE PAGE_SIZE

static
struct nfs_open_dir_context * alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { ctx->attr_gencount = nfsi->attr_gencount; ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); kfree_rcu(ctx, rcu_head); } /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { int res = 0; struct nfs_open_dir_context *ctx; dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSOPEN); ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; } filp->private_data = ctx; out: return res; } static int nfs_closedir(struct inode *inode, struct file *filp) { put_nfs_open_dir_context(file_inode(filp), filp->private_data); return 0; } struct nfs_cache_array_entry { u64 cookie; u64 ino; const char *name; unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { u64 change_attr; u64 last_cookie; unsigned int size; unsigned char folio_full : 1, folio_is_eof : 1, cookies_are_ordered : 1; struct nfs_cache_array_entry array[] __counted_by(size); }; struct nfs_readdir_descriptor { struct file *file; struct folio *folio; struct dir_context *ctx; pgoff_t folio_index; pgoff_t folio_index_max; u64 dir_cookie; u64 last_cookie; loff_t current_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; unsigned long 
attr_gencount; unsigned int cache_entry_index; unsigned int buffer_fills; unsigned int dtsize; bool clear_cache; bool plus; bool eob; bool eof; }; static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) { struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); unsigned int maxsize = server->dtsize; if (sz > maxsize) sz = maxsize; if (sz < NFS_MIN_FILE_IO_SIZE) sz = NFS_MIN_FILE_IO_SIZE; desc->dtsize = sz; } static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize >> 1); } static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize << 1); } static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; array->folio_full = 0; array->folio_is_eof = 0; array->cookies_are_ordered = 1; kunmap_local(array); } /* * we are freeing strings created by nfs_add_to_readdir_array() */ static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; unsigned int i; array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; kunmap_local(array); } static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, u64 change_attr) { nfs_readdir_clear_array(folio); nfs_readdir_folio_init_array(folio, last_cookie, change_attr); } static struct folio * nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct folio *folio = folio_alloc(gfp_flags, 0); if (folio) nfs_readdir_folio_init_array(folio, last_cookie, 0); return folio; } static void nfs_readdir_folio_array_free(struct folio *folio) { if (folio) { nfs_readdir_clear_array(folio); folio_put(folio); } } static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) { return array->size == 0 ? 
array->last_cookie : array->array[0].cookie; } static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->folio_is_eof = 1; array->folio_full = 1; } static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) { return array->folio_full; } /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { const char *ret = kmemdup_nul(name, len, GFP_KERNEL); /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ if (ret != NULL) kmemleak_not_leak(ret); return ret; } static size_t nfs_readdir_array_maxentries(void) { return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry); } /* * Check that the next array entry lies entirely within the page bounds */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { if (array->folio_full) return -ENOSPC; if (array->size == nfs_readdir_array_maxentries()) { array->folio_full = 1; return -ENOSPC; } return 0; } static int nfs_readdir_folio_array_append(struct folio *folio, const struct nfs_entry *entry, u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); array = kmap_local_folio(folio, 0); if (!name) goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); goto out; } array->size++; cache_entry = &array->array[array->size - 1]; cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; cache_entry->name = name; array->last_cookie = entry->cookie; if (array->last_cookie <= cache_entry->cookie) array->cookies_are_ordered = 0; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: 
*cookie = array->last_cookie; kunmap_local(array); return ret; } #define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) /* * Hash algorithm allowing content addressible access to sequences * of directory cookies. Content is addressed by the value of the * cookie index of the first readdir entry in a page. * * We select only the first 18 bits to avoid issues with excessive * memory use for the page cache XArray. 18 bits should allow the caching * of 262144 pages of sequences of readdir entries. Since each page holds * 127 readdir entries for a typical 64-bit system, that works out to a * cache of ~ 33 million entries per directory. */ static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) { if (cookie == 0) return 0; return hash_64(cookie, 18); } static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array = kmap_local_folio(folio, 0); int ret = true; if (array->change_attr != change_attr) ret = false; if (nfs_readdir_array_index_cookie(array) != last_cookie) ret = false; kunmap_local(array); return ret; } static void nfs_readdir_folio_unlock_and_put(struct folio *folio) { folio_unlock(folio); folio_put(folio); } static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, u64 change_attr) { if (folio_test_uptodate(folio)) { if (nfs_readdir_folio_validate(folio, cookie, change_attr)) return; nfs_readdir_clear_array(folio); } nfs_readdir_folio_init_array(folio, cookie, change_attr); folio_mark_uptodate(folio); } static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); return folio; } static u64 nfs_readdir_folio_last_cookie(struct folio *folio) { struct nfs_cache_array *array; u64 ret; array = kmap_local_folio(folio, 0); 
ret = array->last_cookie; kunmap_local(array); return ret; } static bool nfs_readdir_folio_needs_filling(struct folio *folio) { struct nfs_cache_array *array; bool ret; array = kmap_local_folio(folio, 0); ret = !nfs_readdir_array_is_full(array); kunmap_local(array); return ret; } static void nfs_readdir_folio_set_eof(struct folio *folio) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); nfs_readdir_array_set_eof(array); kunmap_local(array); } static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); if (nfs_readdir_folio_last_cookie(folio) != cookie) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } static bool nfs_readdir_use_cookie(const struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return false; return true; } static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { if (array->folio_full) { desc->last_cookie = array->last_cookie; desc->current_index += array->size; desc->cache_entry_index = 0; desc->folio_index++; } else desc->last_cookie = nfs_readdir_array_index_cookie(array); } static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) { desc->current_index = 0; desc->last_cookie = 0; desc->folio_index = 0; } static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; if (diff < 0) goto out_eof; if 
(diff >= array->size) { if (array->folio_is_eof) goto out_eof; nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } index = (unsigned int)diff; desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: desc->eof = true; return -EBADCOOKIE; } static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { if (!array->cookies_are_ordered) return true; /* Optimisation for monotonically increasing cookies */ if (cookie >= array->last_cookie) return false; if (array->size && cookie < array->array[0].cookie) return false; return true; } static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) goto check_eof; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } } check_eof: if (array->folio_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; } else nfs_readdir_seek_next_array(array, desc); return status; } static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; array = kmap_local_folio(desc->folio, 0); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); kunmap_local(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, __be32 *verf, u64 cookie, struct page **pages, size_t bufsize, __be32 *verf_res) { struct inode *inode = file_inode(desc->file); struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, .verf = verf, 
.cookie = cookie, .pages = pages, .page_len = bufsize, .plus = desc->plus, }; struct nfs_readdir_res res = { .verf = verf_res, }; unsigned long timestamp, gencount; int error; again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; error: return error; } static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); int error; error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; } /* Match file and dirent using either filehandle or fileid * Note: caller is responsible for checking the fsid */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; inode = d_inode(dentry); if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; nfsi = NFS_I(inode); if (entry->fattr->fileid != nfsi->fileid) return 0; if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) return 0; return 1; } #define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, unsigned int cache_hits, unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS) return true; if (ctx->pos == 0 || cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* * This function is called by the getattr 
code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_hits); rcu_read_unlock(); } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. */ void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_misses); rcu_read_unlock(); } } static void nfs_lookup_advise_force_readdirplus(struct inode *dir, unsigned int flags) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) return; if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) return; nfs_readdir_record_entry_cache_miss(dir); } static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; int status; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; if (filename.len == 0) return; /* Validate that the name doesn't contain any illegal '\0' */ if (strnlen(filename.name, filename.len) != filename.len) return; /* ...or '/' */ if (strnchr(filename.name, filename.len, '/')) return; if (filename.name[0] == '.') { if (filename.len == 1) return; if (filename.len == 2 && filename.name[1] == '.') return; } filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = 
d_lookup(parent, &filename);
again:
	/* No dentry in the dcache (or the old one was just dropped): allocate. */
	if (!dentry) {
		dentry = d_alloc_parallel(parent, &filename, &wq);
		if (IS_ERR(dentry))
			return;
	}
	if (!d_in_lookup(dentry)) {
		/* Is there a mountpoint here? If so, just exit */
		if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
					&entry->fattr->fsid))
			goto out;
		if (nfs_same_file(dentry, entry)) {
			/* Same file, but no filehandle to refresh it with. */
			if (!entry->fh->size)
				goto out;
			nfs_set_verifier(dentry, dir_verifier);
			status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
			if (!status)
				nfs_setsecurity(d_inode(dentry), entry->fattr);
			trace_nfs_readdir_lookup_revalidate(d_inode(parent),
							    dentry, 0, status);
			goto out;
		} else {
			/*
			 * The cached dentry refers to a different file:
			 * invalidate it and retry with a fresh dentry.
			 */
			trace_nfs_readdir_lookup_revalidate_failed(
				d_inode(parent), dentry, 0);
			d_invalidate(dentry);
			dput(dentry);
			dentry = NULL;
			goto again;
		}
	}
	/* In-lookup dentry but no filehandle: end the lookup without an inode. */
	if (!entry->fh->size) {
		d_lookup_done(dentry);
		goto out;
	}

	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
	alias = d_splice_alias(inode, dentry);
	d_lookup_done(dentry);
	if (alias) {
		if (IS_ERR(alias))
			goto out;
		/* Continue with the alias; drop our ref on the new dentry. */
		dput(dentry);
		dentry = alias;
	}
	nfs_set_verifier(dentry, dir_verifier);
	trace_nfs_readdir_lookup(d_inode(parent), dentry, 0);
out:
	dput(dentry);
}

/*
 * Decode the next entry from @stream into @entry and, when READDIRPLUS is in
 * use (desc->plus), feed it to nfs_prime_dcache().
 *
 * Returns 0 on success or the error returned by xdr_decode().
 */
static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
				    struct nfs_entry *entry,
				    struct xdr_stream *stream)
{
	int ret;

	/* Reset the label length to its maximum before each decode. */
	if (entry->fattr->label)
		entry->fattr->label->len = NFS4_MAXLABELLEN;
	ret = xdr_decode(desc, entry, stream);
	if (ret || !desc->plus)
		return ret;
	nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier);
	return 0;
}

/* Perform conversion from xdr to cache array */
static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
				    struct nfs_entry *entry,
				    struct page **xdr_pages, unsigned int buflen,
				    struct folio **arrays, size_t narrays,
				    u64 change_attr)
{
	struct address_space *mapping = desc->file->f_mapping;
	struct folio *new, *folio = *arrays;
	struct xdr_stream stream;
	struct page *scratch;
	struct xdr_buf buf;
	u64 cookie;
	int status;

	/* Scratch page handed to the XDR stream set up below. */
	scratch = alloc_page(GFP_KERNEL);
	if (scratch == NULL)
		return -ENOMEM;

xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_page(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); if (status != 0) break; status = nfs_readdir_folio_array_append(folio, entry, &cookie); if (status != -ENOSPC) continue; if (folio->mapping != mapping) { if (!--narrays) break; new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = folio = new; } else { new = nfs_readdir_folio_get_next(mapping, cookie, change_attr); if (!new) break; if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); folio = new; } desc->folio_index_max++; status = nfs_readdir_folio_array_append(folio, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: if (!entry->eof) break; nfs_readdir_folio_set_eof(folio); fallthrough; case -EAGAIN: status = 0; break; case -ENOSPC: status = 0; if (!desc->plus) break; while (!nfs_readdir_entry_decode(desc, entry, &stream)) ; } if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); put_page(scratch); return status; } static void nfs_readdir_free_pages(struct page **pages, size_t npages) { while (npages--) put_page(pages[npages]); kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ static struct page **nfs_readdir_alloc_pages(size_t npages) { struct page **pages; size_t i; pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } return pages; out_freepages: nfs_readdir_free_pages(pages, i); return NULL; } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct folio **arrays, size_t narrays) { u64 change_attr; struct page **pages; struct folio *folio = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode 
= file_inode(desc->file); unsigned int dtsize = desc->dtsize; unsigned int pglen; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->cookie = nfs_readdir_folio_last_cookie(folio); entry->fh = nfs_alloc_fhandle(); entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); entry->server = NFS_SERVER(inode); if (entry->fh == NULL || entry->fattr == NULL) goto out; array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = nfs_readdir_alloc_pages(array_size); if (!pages) goto out; change_attr = inode_peek_iversion_raw(inode); status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, dtsize, verf_res); if (status < 0) goto free_pages; pglen = status; if (pglen != 0) status = nfs_readdir_folio_filler(desc, entry, pages, pglen, arrays, narrays, change_attr); else nfs_readdir_folio_set_eof(folio); desc->buffer_fills++; free_pages: nfs_readdir_free_pages(pages, array_size); out: nfs_free_fattr(entry->fattr); nfs_free_fhandle(entry->fh); kfree(entry); return status; } static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { folio_put(desc->folio); desc->folio = NULL; } static void nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { folio_unlock(desc->folio); nfs_readdir_folio_put(desc); } static struct folio * nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); u64 cookie = desc->last_cookie; struct folio *folio; folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); if (!folio) return NULL; if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. 
*/
static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
{
	struct inode *inode = file_inode(desc->file);
	struct nfs_inode *nfsi = NFS_I(inode);
	__be32 verf[NFS_DIR_VERIFIER_SIZE];
	int res;

	desc->folio = nfs_readdir_folio_get_cached(desc);
	if (!desc->folio)
		return -ENOMEM;
	/* A folio that is not yet full must be (further) filled from the server. */
	if (nfs_readdir_folio_needs_filling(desc->folio)) {
		/* Grow the dtsize if we had to go back for more pages */
		if (desc->folio_index == desc->folio_index_max)
			nfs_grow_dtsize(desc);
		desc->folio_index_max = desc->folio_index;
		trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
					     desc->last_cookie,
					     desc->folio->index, desc->dtsize);
		res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
					       &desc->folio, 1);
		if (res < 0) {
			nfs_readdir_folio_unlock_and_put_cached(desc);
			trace_nfs_readdir_cache_fill_done(inode, res);
			/*
			 * On -EBADCOOKIE or -ENOTSYNC, throw away the whole
			 * cache, rewind the search and ask the caller to
			 * retry.
			 */
			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
				invalidate_inode_pages2(desc->file->f_mapping);
				nfs_readdir_rewind_search(desc);
				trace_nfs_readdir_invalidate_cache_range(
					inode, 0, MAX_LFS_FILESIZE);
				return -EAGAIN;
			}
			return res;
		}
		/*
		 * Set the cookie verifier if the page cache was empty
		 *
		 * A changed verifier also invalidates every cached folio from
		 * index 1 onwards.
		 */
		if (desc->last_cookie == 0 &&
		    memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
			memcpy(nfsi->cookieverf, verf,
			       sizeof(nfsi->cookieverf));
			invalidate_inode_pages2_range(desc->file->f_mapping, 1,
						      -1);
			trace_nfs_readdir_invalidate_cache_range(
				inode, 1, MAX_LFS_FILESIZE);
		}
		desc->clear_cache = false;
	}
	res = nfs_readdir_search_array(desc);
	/* On a hit the folio stays locked and referenced in desc->folio. */
	if (res == 0)
		return 0;
	nfs_readdir_folio_unlock_and_put_cached(desc);
	return res;
}

/*
 * Search for desc->dir_cookie from the beginning of the page cache
 *
 * Keeps calling find_and_lock_cache_page() for as long as it returns
 * -EAGAIN, and returns its final result.
 */
static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
{
	int res;

	do {
		res = find_and_lock_cache_page(desc);
	} while (res == -EAGAIN);
	return res;
}

/* Miss-count limit used by the readdir cache heuristics below. */
#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)

/*
 * Once we've found the start of the dirent within a page: fill 'er up...
*/
static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
			   const __be32 *verf)
{
	struct file *file = desc->file;
	struct nfs_cache_array *array;
	unsigned int i;
	/* True when this call starts with no directory cookie (dir_cookie == 0). */
	bool first_emit = !desc->dir_cookie;

	array = kmap_local_folio(desc->folio, 0);
	for (i = desc->cache_entry_index; i < array->size; i++) {
		struct nfs_cache_array_entry *ent;

		/*
		 * nfs_readdir_handle_cache_misses return force clear at
		 * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for
		 * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1
		 * entries need be emitted here.
		 */
		if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) {
			desc->eob = true;
			break;
		}

		ent = &array->array[i];
		/* dir_emit() returning false ends this batch (eob). */
		if (!dir_emit(desc->ctx, ent->name, ent->name_len,
		    nfs_compat_user_ino64(ent->ino), ent->d_type)) {
			desc->eob = true;
			break;
		}
		/* Remember the verifier that goes with the entries emitted so far. */
		memcpy(desc->verf, verf, sizeof(desc->verf));
		if (i == array->size - 1) {
			/* Last entry of this array: move the cursor past it. */
			desc->dir_cookie = array->last_cookie;
			nfs_readdir_seek_next_array(array, desc);
		} else {
			desc->dir_cookie = array->array[i + 1].cookie;
			desc->last_cookie = array->array[0].cookie;
		}
		/* ctx->pos is either the cookie itself or a running entry index. */
		if (nfs_readdir_use_cookie(file))
			desc->ctx->pos = desc->dir_cookie;
		else
			desc->ctx->pos++;
	}
	/* EOF is only reported if the loop was not cut short. */
	if (array->folio_is_eof)
		desc->eof = !desc->eob;

	kunmap_local(array);
	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
			(unsigned long long)desc->dir_cookie);
}

/*
 * If we cannot find a cookie in our cache, we suspect that this is
 * because it points to a deleted file, so we ask the server to return
 * whatever it thinks is the next entry. We then feed this to filldir.
 * If all goes well, we should then be able to find our way round the
 * cache on the next call to readdir_search_pagecache();
 *
 * NOTE: we cannot add the anonymous page to the pagecache because
 *	 the data it contains might not be page aligned. Besides,
 *	 we should already have a complete representation of the
 *	 directory in the page cache by the time we get here.
*/
static int uncached_readdir(struct nfs_readdir_descriptor *desc)
{
	struct folio	**arrays;
	/* Upper bound on the number of private folios used for one call. */
	size_t		i, sz = 512;
	__be32		verf[NFS_DIR_VERIFIER_SIZE];
	int		status = -ENOMEM;

	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n",
			(unsigned long long)desc->dir_cookie);

	/* Zeroed pointer table: a NULL slot marks the end of the used folios. */
	arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL);
	if (!arrays)
		goto out;
	arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL);
	if (!arrays[0])
		goto out;

	/* Start a fresh, private search beginning at the requested cookie. */
	desc->folio_index = 0;
	desc->cache_entry_index = 0;
	desc->last_cookie = desc->dir_cookie;
	desc->folio_index_max = 0;

	trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
				   -1, desc->dtsize);

	status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
	if (status < 0) {
		trace_nfs_readdir_uncached_done(file_inode(desc->file), status);
		goto out_free;
	}

	/* Emit from each filled folio until end-of-buffer is signalled. */
	for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
		desc->folio = arrays[i];
		nfs_do_filldir(desc, verf);
	}
	desc->folio = NULL;

	/*
	 * Grow the dtsize if we have to go back for more pages,
	 * or shrink it if we're reading too many.
	 */
	if (!desc->eof) {
		if (!desc->eob)
			nfs_grow_dtsize(desc);
		else if (desc->buffer_fills == 1 &&
			 i < (desc->folio_index_max >> 1))
			nfs_shrink_dtsize(desc);
	}
out_free:
	for (i = 0; i < sz && arrays[i]; i++)
		nfs_readdir_folio_array_free(arrays[i]);
out:
	/* When positions are entry indexes, restart the cache search from 0. */
	if (!nfs_readdir_use_cookie(desc->file))
		nfs_readdir_rewind_search(desc);
	desc->folio_index_max = -1;
	kfree(arrays);
	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
	return status;
}

/*
 * Decide whether the readdir heuristic calls for a forced cache clear.
 *
 * Returns false when reading from position 0 or when READDIRPLUS is not in
 * use.  It also returns false when @cache_misses does not exceed
 * NFS_READDIR_CACHE_MISS_THRESHOLD and @force_clear is not set.  Otherwise it
 * emits the force-readdirplus tracepoint and returns true.
 */
static bool nfs_readdir_handle_cache_misses(struct inode *inode,
					    struct nfs_readdir_descriptor *desc,
					    unsigned int cache_misses,
					    bool force_clear)
{
	if (desc->ctx->pos == 0 || !desc->plus)
		return false;
	if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear)
		return false;
	trace_nfs_readdir_force_readdirplus(inode);
	return true;
}

/* The file offset position represents the dirent entry number.
A last cookie cache takes care of the common case of reading the whole directory. */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; unsigned int cache_hits, cache_misses; bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. */ nfs_revalidate_mapping(inode, file->f_mapping); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) goto out; desc->file = file; desc->ctx = ctx; desc->folio_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; desc->folio_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { res = 0; goto out_free; } desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits,